Total coverage: 64323 (5%) of 1585121
2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 // SPDX-License-Identifier: GPL-2.0-or-later /* * fs/eventpoll.c (Efficient event retrieval implementation) * Copyright (C) 2001,...,2009 Davide Libenzi * * Davide Libenzi <davidel@xmailserver.org> */ #include <linux/init.h> #include <linux/kernel.h> #include <linux/sched/signal.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/signal.h> #include <linux/errno.h> #include <linux/mm.h> #include <linux/slab.h> #include <linux/poll.h> #include <linux/string.h> #include <linux/list.h> #include <linux/hash.h> #include <linux/spinlock.h> #include <linux/syscalls.h> #include <linux/rbtree.h> #include <linux/wait.h> #include <linux/eventpoll.h> #include <linux/mount.h> #include <linux/bitops.h> #include <linux/mutex.h> #include <linux/anon_inodes.h> #include <linux/device.h> #include <linux/uaccess.h> #include <asm/io.h> #include <asm/mman.h> #include <linux/atomic.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/compat.h> #include <linux/rculist.h> #include <net/busy_poll.h> /* * LOCKING: * There are three level of locking required by epoll : * * 1) epmutex (mutex) * 2) ep->mtx (mutex) * 3) ep->lock (rwlock) * * The acquire order is the one listed above, from 1 to 3. * We need a rwlock (ep->lock) because we manipulate objects * from inside the poll callback, that might be triggered from * a wake_up() that in turn might be called from IRQ context. * So we can't sleep inside the poll callback and hence we need * a spinlock. During the event transfer loop (from kernel to * user space) we could end up sleeping due a copy_to_user(), so * we need a lock that will allow us to sleep. This lock is a * mutex (ep->mtx). It is acquired during the event transfer loop, * during epoll_ctl(EPOLL_CTL_DEL) and during eventpoll_release_file(). * Then we also need a global mutex to serialize eventpoll_release_file() * and ep_free(). * This mutex is acquired by ep_free() during the epoll file * cleanup path and it is also acquired by eventpoll_release_file() * if a file has been pushed inside an epoll set and it is then * close()d without a previous call to epoll_ctl(EPOLL_CTL_DEL). * It is also acquired when inserting an epoll fd onto another epoll * fd. We do this so that we walk the epoll tree and ensure that this * insertion does not create a cycle of epoll file descriptors, which * could lead to deadlock. We need a global mutex to prevent two * simultaneous inserts (A into B and B into A) from racing and * constructing a cycle without either insert observing that it is * going to. * It is necessary to acquire multiple "ep->mtx"es at once in the * case when one epoll fd is added to another. In this case, we * always acquire the locks in the order of nesting (i.e. after * epoll_ctl(e1, EPOLL_CTL_ADD, e2), e1->mtx will always be acquired * before e2->mtx). Since we disallow cycles of epoll file * descriptors, this ensures that the mutexes are well-ordered. 
In * order to communicate this nesting to lockdep, when walking a tree * of epoll file descriptors, we use the current recursion depth as * the lockdep subkey. * It is possible to drop the "ep->mtx" and to use the global * mutex "epmutex" (together with "ep->lock") to have it working, * but having "ep->mtx" will make the interface more scalable. * Events that require holding "epmutex" are very rare, while for * normal operations the epoll private "ep->mtx" will guarantee * a better scalability. */ /* Epoll private bits inside the event mask */ #define EP_PRIVATE_BITS (EPOLLWAKEUP | EPOLLONESHOT | EPOLLET | EPOLLEXCLUSIVE) #define EPOLLINOUT_BITS (EPOLLIN | EPOLLOUT) #define EPOLLEXCLUSIVE_OK_BITS (EPOLLINOUT_BITS | EPOLLERR | EPOLLHUP | \ EPOLLWAKEUP | EPOLLET | EPOLLEXCLUSIVE) /* Maximum number of nesting allowed inside epoll sets */ #define EP_MAX_NESTS 4 #define EP_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event)) #define EP_UNACTIVE_PTR ((void *) -1L) #define EP_ITEM_COST (sizeof(struct epitem) + sizeof(struct eppoll_entry)) struct epoll_filefd { struct file *file; int fd; } __packed; /* Wait structure used by the poll hooks */ struct eppoll_entry { /* List header used to link this structure to the "struct epitem" */ struct eppoll_entry *next; /* The "base" pointer is set to the container "struct epitem" */ struct epitem *base; /* * Wait queue item that will be linked to the target file wait * queue head. */ wait_queue_entry_t wait; /* The wait queue head that linked the "wait" wait queue item */ wait_queue_head_t *whead; }; /* * Each file descriptor added to the eventpoll interface will * have an entry of this type linked to the "rbr" RB tree. * Avoid increasing the size of this struct, there can be many thousands * of these on a server and we do not want this to take another cache line. */ struct epitem { union { /* RB tree node links this structure to the eventpoll RB tree */ struct rb_node rbn; /* Used to free the struct epitem */ struct rcu_head rcu; }; /* List header used to link this structure to the eventpoll ready list */ struct list_head rdllink; /* * Works together "struct eventpoll"->ovflist in keeping the * single linked chain of items. */ struct epitem *next; /* The file descriptor information this item refers to */ struct epoll_filefd ffd; /* List containing poll wait queues */ struct eppoll_entry *pwqlist; /* The "container" of this item */ struct eventpoll *ep; /* List header used to link this item to the "struct file" items list */ struct hlist_node fllink; /* wakeup_source used when EPOLLWAKEUP is set */ struct wakeup_source __rcu *ws; /* The structure that describe the interested events and the source fd */ struct epoll_event event; }; /* * This structure is stored inside the "private_data" member of the file * structure and represents the main data structure for the eventpoll * interface. */ struct eventpoll { /* * This mutex is used to ensure that files are not removed * while epoll is using them. This is held during the event * collection loop, the file cleanup path, the epoll file exit * code and the ctl operations. 
*/ struct mutex mtx; /* Wait queue used by sys_epoll_wait() */ wait_queue_head_t wq; /* Wait queue used by file->poll() */ wait_queue_head_t poll_wait; /* List of ready file descriptors */ struct list_head rdllist; /* Lock which protects rdllist and ovflist */ rwlock_t lock; /* RB tree root used to store monitored fd structs */ struct rb_root_cached rbr; /* * This is a single linked list that chains all the "struct epitem" that * happened while transferring ready events to userspace w/out * holding ->lock. */ struct epitem *ovflist; /* wakeup_source used when ep_scan_ready_list is running */ struct wakeup_source *ws; /* The user that created the eventpoll descriptor */ struct user_struct *user; struct file *file; /* used to optimize loop detection check */ u64 gen; struct hlist_head refs; #ifdef CONFIG_NET_RX_BUSY_POLL /* used to track busy poll napi_id */ unsigned int napi_id; #endif #ifdef CONFIG_DEBUG_LOCK_ALLOC /* tracks wakeup nests for lockdep validation */ u8 nests; #endif }; /* Wrapper struct used by poll queueing */ struct ep_pqueue { poll_table pt; struct epitem *epi; }; /* * Configuration options available inside /proc/sys/fs/epoll/ */ /* Maximum number of epoll watched descriptors, per user */ static long max_user_watches __read_mostly; /* * This mutex is used to serialize ep_free() and eventpoll_release_file(). */ static DEFINE_MUTEX(epmutex); static u64 loop_check_gen = 0; /* Used to check for epoll file descriptor inclusion loops */ static struct eventpoll *inserting_into; /* Slab cache used to allocate "struct epitem" */ static struct kmem_cache *epi_cache __read_mostly; /* Slab cache used to allocate "struct eppoll_entry" */ static struct kmem_cache *pwq_cache __read_mostly; /* * List of files with newly added links, where we may need to limit the number * of emanating paths. Protected by the epmutex. 
*/ struct epitems_head { struct hlist_head epitems; struct epitems_head *next; }; static struct epitems_head *tfile_check_list = EP_UNACTIVE_PTR; static struct kmem_cache *ephead_cache __read_mostly; static inline void free_ephead(struct epitems_head *head) { if (head) kmem_cache_free(ephead_cache, head); } static void list_file(struct file *file) { struct epitems_head *head; head = container_of(file->f_ep, struct epitems_head, epitems); if (!head->next) { head->next = tfile_check_list; tfile_check_list = head; } } static void unlist_file(struct epitems_head *head) { struct epitems_head *to_free = head; struct hlist_node *p = rcu_dereference(hlist_first_rcu(&head->epitems)); if (p) { struct epitem *epi= container_of(p, struct epitem, fllink); spin_lock(&epi->ffd.file->f_lock); if (!hlist_empty(&head->epitems)) to_free = NULL; head->next = NULL; spin_unlock(&epi->ffd.file->f_lock); } free_ephead(to_free); } #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> static long long_zero; static long long_max = LONG_MAX; struct ctl_table epoll_table[] = { { .procname = "max_user_watches", .data = &max_user_watches, .maxlen = sizeof(max_user_watches), .mode = 0644, .proc_handler = proc_doulongvec_minmax, .extra1 = &long_zero, .extra2 = &long_max, }, { } }; #endif /* CONFIG_SYSCTL */ static const struct file_operations eventpoll_fops; static inline int is_file_epoll(struct file *f) { return f->f_op == &eventpoll_fops; } /* Setup the structure that is used as key for the RB tree */ static inline void ep_set_ffd(struct epoll_filefd *ffd, struct file *file, int fd) { ffd->file = file; ffd->fd = fd; } /* Compare RB tree keys */ static inline int ep_cmp_ffd(struct epoll_filefd *p1, struct epoll_filefd *p2) { return (p1->file > p2->file ? +1: (p1->file < p2->file ? -1 : p1->fd - p2->fd)); } /* Tells us if the item is currently linked */ static inline int ep_is_linked(struct epitem *epi) { return !list_empty(&epi->rdllink); } static inline struct eppoll_entry *ep_pwq_from_wait(wait_queue_entry_t *p) { return container_of(p, struct eppoll_entry, wait); } /* Get the "struct epitem" from a wait queue pointer */ static inline struct epitem *ep_item_from_wait(wait_queue_entry_t *p) { return container_of(p, struct eppoll_entry, wait)->base; } /** * ep_events_available - Checks if ready events might be available. * * @ep: Pointer to the eventpoll context. * * Return: a value different than %zero if ready events are available, * or %zero otherwise. */ static inline int ep_events_available(struct eventpoll *ep) { return !list_empty_careful(&ep->rdllist) || READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR; } #ifdef CONFIG_NET_RX_BUSY_POLL static bool ep_busy_loop_end(void *p, unsigned long start_time) { struct eventpoll *ep = p; return ep_events_available(ep) || busy_loop_timeout(start_time); } /* * Busy poll if globally on and supporting sockets found && no events, * busy loop will return if need_resched or ep_events_available. * * we must do our busy polling with irqs enabled */ static bool ep_busy_loop(struct eventpoll *ep, int nonblock) { unsigned int napi_id = READ_ONCE(ep->napi_id); if ((napi_id >= MIN_NAPI_ID) && net_busy_loop_on()) { napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end, ep, false, BUSY_POLL_BUDGET); if (ep_events_available(ep)) return true; /* * Busy poll timed out. Drop NAPI ID for now, we can add * it back in when we have moved a socket with a valid NAPI * ID onto the ready list. */ ep->napi_id = 0; return false; } return false; } /* * Set epoll busy poll NAPI ID from sk. 
*/ static inline void ep_set_busy_poll_napi_id(struct epitem *epi) { struct eventpoll *ep; unsigned int napi_id; struct socket *sock; struct sock *sk; if (!net_busy_loop_on()) return; sock = sock_from_file(epi->ffd.file); if (!sock) return; sk = sock->sk; if (!sk) return; napi_id = READ_ONCE(sk->sk_napi_id); ep = epi->ep; /* Non-NAPI IDs can be rejected * or * Nothing to do if we already have this ID */ if (napi_id < MIN_NAPI_ID || napi_id == ep->napi_id) return; /* record NAPI ID for use in next busy poll */ ep->napi_id = napi_id; } #else static inline bool ep_busy_loop(struct eventpoll *ep, int nonblock) { return false; } static inline void ep_set_busy_poll_napi_id(struct epitem *epi) { } #endif /* CONFIG_NET_RX_BUSY_POLL */ /* * As described in commit 0ccf831cb lockdep: annotate epoll * the use of wait queues used by epoll is done in a very controlled * manner. Wake ups can nest inside each other, but are never done * with the same locking. For example: * * dfd = socket(...); * efd1 = epoll_create(); * efd2 = epoll_create(); * epoll_ctl(efd1, EPOLL_CTL_ADD, dfd, ...); * epoll_ctl(efd2, EPOLL_CTL_ADD, efd1, ...); * * When a packet arrives to the device underneath "dfd", the net code will * issue a wake_up() on its poll wake list. Epoll (efd1) has installed a * callback wakeup entry on that queue, and the wake_up() performed by the * "dfd" net code will end up in ep_poll_callback(). At this point epoll * (efd1) notices that it may have some event ready, so it needs to wake up * the waiters on its poll wait list (efd2). So it calls ep_poll_safewake() * that ends up in another wake_up(), after having checked about the * recursion constraints. That are, no more than EP_MAX_POLLWAKE_NESTS, to * avoid stack blasting. * * When CONFIG_DEBUG_LOCK_ALLOC is enabled, make sure lockdep can handle * this special case of epoll. */ #ifdef CONFIG_DEBUG_LOCK_ALLOC static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi, unsigned pollflags) { struct eventpoll *ep_src; unsigned long flags; u8 nests = 0; /* * To set the subclass or nesting level for spin_lock_irqsave_nested() * it might be natural to create a per-cpu nest count. However, since * we can recurse on ep->poll_wait.lock, and a non-raw spinlock can * schedule() in the -rt kernel, the per-cpu variable are no longer * protected. Thus, we are introducing a per eventpoll nest field. * If we are not being call from ep_poll_callback(), epi is NULL and * we are at the first level of nesting, 0. Otherwise, we are being * called from ep_poll_callback() and if a previous wakeup source is * not an epoll file itself, we are at depth 1 since the wakeup source * is depth 0. If the wakeup source is a previous epoll file in the * wakeup chain then we use its nests value and record ours as * nests + 1. The previous epoll file nests value is stable since its * already holding its own poll_wait.lock. 
*/ if (epi) { if ((is_file_epoll(epi->ffd.file))) { ep_src = epi->ffd.file->private_data; nests = ep_src->nests; } else { nests = 1; } } spin_lock_irqsave_nested(&ep->poll_wait.lock, flags, nests); ep->nests = nests + 1; wake_up_locked_poll(&ep->poll_wait, EPOLLIN | pollflags); ep->nests = 0; spin_unlock_irqrestore(&ep->poll_wait.lock, flags); } #else static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi, unsigned pollflags) { wake_up_poll(&ep->poll_wait, EPOLLIN | pollflags); } #endif static void ep_remove_wait_queue(struct eppoll_entry *pwq) { wait_queue_head_t *whead; rcu_read_lock(); /* * If it is cleared by POLLFREE, it should be rcu-safe. * If we read NULL we need a barrier paired with * smp_store_release() in ep_poll_callback(), otherwise * we rely on whead->lock. */ whead = smp_load_acquire(&pwq->whead); if (whead) remove_wait_queue(whead, &pwq->wait); rcu_read_unlock(); } /* * This function unregisters poll callbacks from the associated file * descriptor. Must be called with "mtx" held (or "epmutex" if called from * ep_free). */ static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi) { struct eppoll_entry **p = &epi->pwqlist; struct eppoll_entry *pwq; while ((pwq = *p) != NULL) { *p = pwq->next; ep_remove_wait_queue(pwq); kmem_cache_free(pwq_cache, pwq); } } /* call only when ep->mtx is held */ static inline struct wakeup_source *ep_wakeup_source(struct epitem *epi) { return rcu_dereference_check(epi->ws, lockdep_is_held(&epi->ep->mtx)); } /* call only when ep->mtx is held */ static inline void ep_pm_stay_awake(struct epitem *epi) { struct wakeup_source *ws = ep_wakeup_source(epi); if (ws) __pm_stay_awake(ws); } static inline bool ep_has_wakeup_source(struct epitem *epi) { return rcu_access_pointer(epi->ws) ? true : false; } /* call when ep->mtx cannot be held (ep_poll_callback) */ static inline void ep_pm_stay_awake_rcu(struct epitem *epi) { struct wakeup_source *ws; rcu_read_lock(); ws = rcu_dereference(epi->ws); if (ws) __pm_stay_awake(ws); rcu_read_unlock(); } /* * ep->mutex needs to be held because we could be hit by * eventpoll_release_file() and epoll_ctl(). */ static void ep_start_scan(struct eventpoll *ep, struct list_head *txlist) { /* * Steal the ready list, and re-init the original one to the * empty list. Also, set ep->ovflist to NULL so that events * happening while looping w/out locks, are not lost. We cannot * have the poll callback to queue directly on ep->rdllist, * because we want the "sproc" callback to be able to do it * in a lockless way. */ lockdep_assert_irqs_enabled(); write_lock_irq(&ep->lock); list_splice_init(&ep->rdllist, txlist); WRITE_ONCE(ep->ovflist, NULL); write_unlock_irq(&ep->lock); } static void ep_done_scan(struct eventpoll *ep, struct list_head *txlist) { struct epitem *epi, *nepi; write_lock_irq(&ep->lock); /* * During the time we spent inside the "sproc" callback, some * other events might have been queued by the poll callback. * We re-insert them inside the main ready-list here. */ for (nepi = READ_ONCE(ep->ovflist); (epi = nepi) != NULL; nepi = epi->next, epi->next = EP_UNACTIVE_PTR) { /* * We need to check if the item is already in the list. * During the "sproc" callback execution time, items are * queued into ->ovflist but the "txlist" might already * contain them, and the list_splice() below takes care of them. */ if (!ep_is_linked(epi)) { /* * ->ovflist is LIFO, so we have to reverse it in order * to keep in FIFO. 
*/ list_add(&epi->rdllink, &ep->rdllist); ep_pm_stay_awake(epi); } } /* * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after * releasing the lock, events will be queued in the normal way inside * ep->rdllist. */ WRITE_ONCE(ep->ovflist, EP_UNACTIVE_PTR); /* * Quickly re-inject items left on "txlist". */ list_splice(txlist, &ep->rdllist); __pm_relax(ep->ws); if (!list_empty(&ep->rdllist)) { if (waitqueue_active(&ep->wq)) wake_up(&ep->wq); } write_unlock_irq(&ep->lock); } static void epi_rcu_free(struct rcu_head *head) { struct epitem *epi = container_of(head, struct epitem, rcu); kmem_cache_free(epi_cache, epi); } /* * Removes a "struct epitem" from the eventpoll RB tree and deallocates * all the associated resources. Must be called with "mtx" held. */ static int ep_remove(struct eventpoll *ep, struct epitem *epi) { struct file *file = epi->ffd.file; struct epitems_head *to_free; struct hlist_head *head; lockdep_assert_irqs_enabled(); /* * Removes poll wait queue hooks. */ ep_unregister_pollwait(ep, epi); /* Remove the current item from the list of epoll hooks */ spin_lock(&file->f_lock); to_free = NULL; head = file->f_ep; if (head->first == &epi->fllink && !epi->fllink.next) { /* See eventpoll_release() for details. */ WRITE_ONCE(file->f_ep, NULL); if (!is_file_epoll(file)) { struct epitems_head *v; v = container_of(head, struct epitems_head, epitems); if (!smp_load_acquire(&v->next)) to_free = v; } } hlist_del_rcu(&epi->fllink); spin_unlock(&file->f_lock); free_ephead(to_free); rb_erase_cached(&epi->rbn, &ep->rbr); write_lock_irq(&ep->lock); if (ep_is_linked(epi)) list_del_init(&epi->rdllink); write_unlock_irq(&ep->lock); wakeup_source_unregister(ep_wakeup_source(epi)); /* * At this point it is safe to free the eventpoll item. Use the union * field epi->rcu, since we are trying to minimize the size of * 'struct epitem'. The 'rbn' field is no longer in use. Protected by * ep->mtx. The rcu read side, reverse_path_check_proc(), does not make * use of the rbn field. */ call_rcu(&epi->rcu, epi_rcu_free); percpu_counter_dec(&ep->user->epoll_watches); return 0; } static void ep_free(struct eventpoll *ep) { struct rb_node *rbp; struct epitem *epi; /* We need to release all tasks waiting for these file */ if (waitqueue_active(&ep->poll_wait)) ep_poll_safewake(ep, NULL, 0); /* * We need to lock this because we could be hit by * eventpoll_release_file() while we're freeing the "struct eventpoll". * We do not need to hold "ep->mtx" here because the epoll file * is on the way to be removed and no one has references to it * anymore. The only hit might come from eventpoll_release_file() but * holding "epmutex" is sufficient here. */ mutex_lock(&epmutex); /* * Walks through the whole tree by unregistering poll callbacks. */ for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) { epi = rb_entry(rbp, struct epitem, rbn); ep_unregister_pollwait(ep, epi); cond_resched(); } /* * Walks through the whole tree by freeing each "struct epitem". At this * point we are sure no poll callbacks will be lingering around, and also by * holding "epmutex" we can be sure that no file cleanup code will hit * us during this operation. So we can avoid the lock on "ep->lock". * We do not need to lock ep->mtx, either, we only do it to prevent * a lockdep warning. 
*/ mutex_lock(&ep->mtx); while ((rbp = rb_first_cached(&ep->rbr)) != NULL) { epi = rb_entry(rbp, struct epitem, rbn); ep_remove(ep, epi); cond_resched(); } mutex_unlock(&ep->mtx); mutex_unlock(&epmutex); mutex_destroy(&ep->mtx); free_uid(ep->user); wakeup_source_unregister(ep->ws); kfree(ep); } static int ep_eventpoll_release(struct inode *inode, struct file *file) { struct eventpoll *ep = file->private_data; if (ep) ep_free(ep); return 0; } static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt, int depth); static __poll_t __ep_eventpoll_poll(struct file *file, poll_table *wait, int depth) { struct eventpoll *ep = file->private_data; LIST_HEAD(txlist); struct epitem *epi, *tmp; poll_table pt; __poll_t res = 0; init_poll_funcptr(&pt, NULL); /* Insert inside our poll wait queue */ poll_wait(file, &ep->poll_wait, wait); /* * Proceed to find out if wanted events are really available inside * the ready list. */ mutex_lock_nested(&ep->mtx, depth); ep_start_scan(ep, &txlist); list_for_each_entry_safe(epi, tmp, &txlist, rdllink) { if (ep_item_poll(epi, &pt, depth + 1)) { res = EPOLLIN | EPOLLRDNORM; break; } else { /* * Item has been dropped into the ready list by the poll * callback, but it's not actually ready, as far as * caller requested events goes. We can remove it here. */ __pm_relax(ep_wakeup_source(epi)); list_del_init(&epi->rdllink); } } ep_done_scan(ep, &txlist); mutex_unlock(&ep->mtx); return res; } /* * The ffd.file pointer may be in the process of being torn down due to * being closed, but we may not have finished eventpoll_release() yet. * * Normally, even with the atomic_long_inc_not_zero, the file may have * been free'd and then gotten re-allocated to something else (since * files are not RCU-delayed, they are SLAB_TYPESAFE_BY_RCU). * * But for epoll, users hold the ep->mtx mutex, and as such any file in * the process of being free'd will block in eventpoll_release_file() * and thus the underlying file allocation will not be free'd, and the * file re-use cannot happen. * * For the same reason we can avoid a rcu_read_lock() around the * operation - 'ffd.file' cannot go away even if the refcount has * reached zero (but we must still not call out to ->poll() functions * etc). */ static struct file *epi_fget(const struct epitem *epi) { struct file *file; file = epi->ffd.file; if (!atomic_long_inc_not_zero(&file->f_count)) file = NULL; return file; } /* * Differs from ep_eventpoll_poll() in that internal callers already have * the ep->mtx so we need to start from depth=1, such that mutex_lock_nested() * is correctly annotated. */ static __poll_t ep_item_poll(const struct epitem *epi, poll_table *pt, int depth) { struct file *file = epi_fget(epi); __poll_t res; /* * We could return EPOLLERR | EPOLLHUP or something, but let's * treat this more as "file doesn't exist, poll didn't happen". 
*/ if (!file) return 0; pt->_key = epi->event.events; if (!is_file_epoll(file)) res = vfs_poll(file, pt); else res = __ep_eventpoll_poll(file, pt, depth); fput(file); return res & epi->event.events; } static __poll_t ep_eventpoll_poll(struct file *file, poll_table *wait) { return __ep_eventpoll_poll(file, wait, 0); } #ifdef CONFIG_PROC_FS static void ep_show_fdinfo(struct seq_file *m, struct file *f) { struct eventpoll *ep = f->private_data; struct rb_node *rbp; mutex_lock(&ep->mtx); for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) { struct epitem *epi = rb_entry(rbp, struct epitem, rbn); struct inode *inode = file_inode(epi->ffd.file); seq_printf(m, "tfd: %8d events: %8x data: %16llx " " pos:%lli ino:%lx sdev:%x\n", epi->ffd.fd, epi->event.events, (long long)epi->event.data, (long long)epi->ffd.file->f_pos, inode->i_ino, inode->i_sb->s_dev); if (seq_has_overflowed(m)) break; } mutex_unlock(&ep->mtx); } #endif /* File callbacks that implement the eventpoll file behaviour */ static const struct file_operations eventpoll_fops = { #ifdef CONFIG_PROC_FS .show_fdinfo = ep_show_fdinfo, #endif .release = ep_eventpoll_release, .poll = ep_eventpoll_poll, .llseek = noop_llseek, }; /* * This is called from eventpoll_release() to unlink files from the eventpoll * interface. We need to have this facility to cleanup correctly files that are * closed without being removed from the eventpoll interface. */ void eventpoll_release_file(struct file *file) { struct eventpoll *ep; struct epitem *epi; struct hlist_node *next; /* * We don't want to get "file->f_lock" because it is not * necessary. It is not necessary because we're in the "struct file" * cleanup path, and this means that no one is using this file anymore. * So, for example, epoll_ctl() cannot hit here since if we reach this * point, the file counter already went to zero and fget() would fail. * The only hit might come from ep_free() but by holding the mutex * will correctly serialize the operation. We do need to acquire * "ep->mtx" after "epmutex" because ep_remove() requires it when called * from anywhere but ep_free(). * * Besides, ep_remove() acquires the lock, so we can't hold it here. */ mutex_lock(&epmutex); if (unlikely(!file->f_ep)) { mutex_unlock(&epmutex); return; } hlist_for_each_entry_safe(epi, next, file->f_ep, fllink) { ep = epi->ep; mutex_lock_nested(&ep->mtx, 0); ep_remove(ep, epi); mutex_unlock(&ep->mtx); } mutex_unlock(&epmutex); } static int ep_alloc(struct eventpoll **pep) { int error; struct user_struct *user; struct eventpoll *ep; user = get_current_user(); error = -ENOMEM; ep = kzalloc(sizeof(*ep), GFP_KERNEL); if (unlikely(!ep)) goto free_uid; mutex_init(&ep->mtx); rwlock_init(&ep->lock); init_waitqueue_head(&ep->wq); init_waitqueue_head(&ep->poll_wait); INIT_LIST_HEAD(&ep->rdllist); ep->rbr = RB_ROOT_CACHED; ep->ovflist = EP_UNACTIVE_PTR; ep->user = user; *pep = ep; return 0; free_uid: free_uid(user); return error; } /* * Search the file inside the eventpoll tree. The RB tree operations * are protected by the "mtx" mutex, and ep_find() must be called with * "mtx" held. 
*/ static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd) { int kcmp; struct rb_node *rbp; struct epitem *epi, *epir = NULL; struct epoll_filefd ffd; ep_set_ffd(&ffd, file, fd); for (rbp = ep->rbr.rb_root.rb_node; rbp; ) { epi = rb_entry(rbp, struct epitem, rbn); kcmp = ep_cmp_ffd(&ffd, &epi->ffd); if (kcmp > 0) rbp = rbp->rb_right; else if (kcmp < 0) rbp = rbp->rb_left; else { epir = epi; break; } } return epir; } #ifdef CONFIG_KCMP static struct epitem *ep_find_tfd(struct eventpoll *ep, int tfd, unsigned long toff) { struct rb_node *rbp; struct epitem *epi; for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) { epi = rb_entry(rbp, struct epitem, rbn); if (epi->ffd.fd == tfd) { if (toff == 0) return epi; else toff--; } cond_resched(); } return NULL; } struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd, unsigned long toff) { struct file *file_raw; struct eventpoll *ep; struct epitem *epi; if (!is_file_epoll(file)) return ERR_PTR(-EINVAL); ep = file->private_data; mutex_lock(&ep->mtx); epi = ep_find_tfd(ep, tfd, toff); if (epi) file_raw = epi->ffd.file; else file_raw = ERR_PTR(-ENOENT); mutex_unlock(&ep->mtx); return file_raw; } #endif /* CONFIG_KCMP */ /* * Adds a new entry to the tail of the list in a lockless way, i.e. * multiple CPUs are allowed to call this function concurrently. * * Beware: it is necessary to prevent any other modifications of the * existing list until all changes are completed, in other words * concurrent list_add_tail_lockless() calls should be protected * with a read lock, where write lock acts as a barrier which * makes sure all list_add_tail_lockless() calls are fully * completed. * * Also an element can be locklessly added to the list only in one * direction i.e. either to the tail or to the head, otherwise * concurrent access will corrupt the list. * * Return: %false if element has been already added to the list, %true * otherwise. */ static inline bool list_add_tail_lockless(struct list_head *new, struct list_head *head) { struct list_head *prev; /* * This is simple 'new->next = head' operation, but cmpxchg() * is used in order to detect that same element has been just * added to the list from another CPU: the winner observes * new->next == new. */ if (cmpxchg(&new->next, new, head) != new) return false; /* * Initially ->next of a new element must be updated with the head * (we are inserting to the tail) and only then pointers are atomically * exchanged. XCHG guarantees memory ordering, thus ->next should be * updated before pointers are actually swapped and pointers are * swapped before prev->next is updated. */ prev = xchg(&head->prev, new); /* * It is safe to modify prev->next and new->prev, because a new element * is added only to the tail and new->next is updated before XCHG. */ prev->next = new; new->prev = prev; return true; } /* * Chains a new epi entry to the tail of the ep->ovflist in a lockless way, * i.e. multiple CPUs are allowed to call this function concurrently. * * Return: %false if epi element has been already chained, %true otherwise. 
*/ static inline bool chain_epi_lockless(struct epitem *epi) { struct eventpoll *ep = epi->ep; /* Fast preliminary check */ if (epi->next != EP_UNACTIVE_PTR) return false; /* Check that the same epi has not been just chained from another CPU */ if (cmpxchg(&epi->next, EP_UNACTIVE_PTR, NULL) != EP_UNACTIVE_PTR) return false; /* Atomically exchange tail */ epi->next = xchg(&ep->ovflist, epi); return true; } /* * This is the callback that is passed to the wait queue wakeup * mechanism. It is called by the stored file descriptors when they * have events to report. * * This callback takes a read lock in order not to contend with concurrent * events from another file descriptor, thus all modifications to ->rdllist * or ->ovflist are lockless. Read lock is paired with the write lock from * ep_scan_ready_list(), which stops all list modifications and guarantees * that lists state is seen correctly. * * Another thing worth to mention is that ep_poll_callback() can be called * concurrently for the same @epi from different CPUs if poll table was inited * with several wait queues entries. Plural wakeup from different CPUs of a * single wait queue is serialized by wq.lock, but the case when multiple wait * queues are used should be detected accordingly. This is detected using * cmpxchg() operation. */ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { int pwake = 0; struct epitem *epi = ep_item_from_wait(wait); struct eventpoll *ep = epi->ep; __poll_t pollflags = key_to_poll(key); unsigned long flags; int ewake = 0; read_lock_irqsave(&ep->lock, flags); ep_set_busy_poll_napi_id(epi); /* * If the event mask does not contain any poll(2) event, we consider the * descriptor to be disabled. This condition is likely the effect of the * EPOLLONESHOT bit that disables the descriptor when an event is received, * until the next EPOLL_CTL_MOD will be issued. */ if (!(epi->event.events & ~EP_PRIVATE_BITS)) goto out_unlock; /* * Check the events coming with the callback. At this stage, not * every device reports the events in the "key" parameter of the * callback. We need to be able to handle both cases here, hence the * test for "key" != NULL before the event match test. */ if (pollflags && !(pollflags & epi->event.events)) goto out_unlock; /* * If we are transferring events to userspace, we can hold no locks * (because we're accessing user memory, and because of linux f_op->poll() * semantics). All the events that happen during that period of time are * chained in ep->ovflist and requeued later on. */ if (READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR) { if (chain_epi_lockless(epi)) ep_pm_stay_awake_rcu(epi); } else if (!ep_is_linked(epi)) { /* In the usual case, add event to ready list. */ if (list_add_tail_lockless(&epi->rdllink, &ep->rdllist)) ep_pm_stay_awake_rcu(epi); } /* * Wake up ( if active ) both the eventpoll wait list and the ->poll() * wait list. 
*/ if (waitqueue_active(&ep->wq)) { if ((epi->event.events & EPOLLEXCLUSIVE) && !(pollflags & POLLFREE)) { switch (pollflags & EPOLLINOUT_BITS) { case EPOLLIN: if (epi->event.events & EPOLLIN) ewake = 1; break; case EPOLLOUT: if (epi->event.events & EPOLLOUT) ewake = 1; break; case 0: ewake = 1; break; } } if (sync) wake_up_sync(&ep->wq); else wake_up(&ep->wq); } if (waitqueue_active(&ep->poll_wait)) pwake++; out_unlock: read_unlock_irqrestore(&ep->lock, flags); /* We have to call this outside the lock */ if (pwake) ep_poll_safewake(ep, epi, pollflags & EPOLL_URING_WAKE); if (!(epi->event.events & EPOLLEXCLUSIVE)) ewake = 1; if (pollflags & POLLFREE) { /* * If we race with ep_remove_wait_queue() it can miss * ->whead = NULL and do another remove_wait_queue() after * us, so we can't use __remove_wait_queue(). */ list_del_init(&wait->entry); /* * ->whead != NULL protects us from the race with ep_free() * or ep_remove(), ep_remove_wait_queue() takes whead->lock * held by the caller. Once we nullify it, nothing protects * ep/epi or even wait. */ smp_store_release(&ep_pwq_from_wait(wait)->whead, NULL); } return ewake; } /* * This is the callback that is used to add our wait queue to the * target file wakeup lists. */ static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead, poll_table *pt) { struct ep_pqueue *epq = container_of(pt, struct ep_pqueue, pt); struct epitem *epi = epq->epi; struct eppoll_entry *pwq; if (unlikely(!epi)) // an earlier allocation has failed return; pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL); if (unlikely(!pwq)) { epq->epi = NULL; return; } init_waitqueue_func_entry(&pwq->wait, ep_poll_callback); pwq->whead = whead; pwq->base = epi; if (epi->event.events & EPOLLEXCLUSIVE) add_wait_queue_exclusive(whead, &pwq->wait); else add_wait_queue(whead, &pwq->wait); pwq->next = epi->pwqlist; epi->pwqlist = pwq; } static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi) { int kcmp; struct rb_node **p = &ep->rbr.rb_root.rb_node, *parent = NULL; struct epitem *epic; bool leftmost = true; while (*p) { parent = *p; epic = rb_entry(parent, struct epitem, rbn); kcmp = ep_cmp_ffd(&epi->ffd, &epic->ffd); if (kcmp > 0) { p = &parent->rb_right; leftmost = false; } else p = &parent->rb_left; } rb_link_node(&epi->rbn, parent, p); rb_insert_color_cached(&epi->rbn, &ep->rbr, leftmost); } #define PATH_ARR_SIZE 5 /* * These are the number paths of length 1 to 5, that we are allowing to emanate * from a single file of interest. For example, we allow 1000 paths of length * 1, to emanate from each file of interest. This essentially represents the * potential wakeup paths, which need to be limited in order to avoid massive * uncontrolled wakeup storms. The common use case should be a single ep which * is connected to n file sources. In this case each file source has 1 path * of length 1. Thus, the numbers below should be more than sufficient. These * path limits are enforced during an EPOLL_CTL_ADD operation, since a modify * and delete can't add additional paths. Protected by the epmutex. 
*/ static const int path_limits[PATH_ARR_SIZE] = { 1000, 500, 100, 50, 10 }; static int path_count[PATH_ARR_SIZE]; static int path_count_inc(int nests) { /* Allow an arbitrary number of depth 1 paths */ if (nests == 0) return 0; if (++path_count[nests] > path_limits[nests]) return -1; return 0; } static void path_count_init(void) { int i; for (i = 0; i < PATH_ARR_SIZE; i++) path_count[i] = 0; } static int reverse_path_check_proc(struct hlist_head *refs, int depth) { int error = 0; struct epitem *epi; if (depth > EP_MAX_NESTS) /* too deep nesting */ return -1; /* CTL_DEL can remove links here, but that can't increase our count */ hlist_for_each_entry_rcu(epi, refs, fllink) { struct hlist_head *refs = &epi->ep->refs; if (hlist_empty(refs)) error = path_count_inc(depth); else error = reverse_path_check_proc(refs, depth + 1); if (error != 0) break; } return error; } /** * reverse_path_check - The tfile_check_list is list of epitem_head, which have * links that are proposed to be newly added. We need to * make sure that those added links don't add too many * paths such that we will spend all our time waking up * eventpoll objects. * * Return: %zero if the proposed links don't create too many paths, * %-1 otherwise. */ static int reverse_path_check(void) { struct epitems_head *p; for (p = tfile_check_list; p != EP_UNACTIVE_PTR; p = p->next) { int error; path_count_init(); rcu_read_lock(); error = reverse_path_check_proc(&p->epitems, 0); rcu_read_unlock(); if (error) return error; } return 0; } static int ep_create_wakeup_source(struct epitem *epi) { struct name_snapshot n; struct wakeup_source *ws; if (!epi->ep->ws) { epi->ep->ws = wakeup_source_register(NULL, "eventpoll"); if (!epi->ep->ws) return -ENOMEM; } take_dentry_name_snapshot(&n, epi->ffd.file->f_path.dentry); ws = wakeup_source_register(NULL, n.name.name); release_dentry_name_snapshot(&n); if (!ws) return -ENOMEM; rcu_assign_pointer(epi->ws, ws); return 0; } /* rare code path, only used when EPOLL_CTL_MOD removes a wakeup source */ static noinline void ep_destroy_wakeup_source(struct epitem *epi) { struct wakeup_source *ws = ep_wakeup_source(epi); RCU_INIT_POINTER(epi->ws, NULL); /* * wait for ep_pm_stay_awake_rcu to finish, synchronize_rcu is * used internally by wakeup_source_remove, too (called by * wakeup_source_unregister), so we cannot use call_rcu */ synchronize_rcu(); wakeup_source_unregister(ws); } static int attach_epitem(struct file *file, struct epitem *epi) { struct epitems_head *to_free = NULL; struct hlist_head *head = NULL; struct eventpoll *ep = NULL; if (is_file_epoll(file)) ep = file->private_data; if (ep) { head = &ep->refs; } else if (!READ_ONCE(file->f_ep)) { allocate: to_free = kmem_cache_zalloc(ephead_cache, GFP_KERNEL); if (!to_free) return -ENOMEM; head = &to_free->epitems; } spin_lock(&file->f_lock); if (!file->f_ep) { if (unlikely(!head)) { spin_unlock(&file->f_lock); goto allocate; } /* See eventpoll_release() for details. */ WRITE_ONCE(file->f_ep, head); to_free = NULL; } hlist_add_head_rcu(&epi->fllink, file->f_ep); spin_unlock(&file->f_lock); free_ephead(to_free); return 0; } /* * Must be called with "mtx" held. 
*/ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, struct file *tfile, int fd, int full_check) { int error, pwake = 0; __poll_t revents; struct epitem *epi; struct ep_pqueue epq; struct eventpoll *tep = NULL; if (is_file_epoll(tfile)) tep = tfile->private_data; lockdep_assert_irqs_enabled(); if (unlikely(percpu_counter_compare(&ep->user->epoll_watches, max_user_watches) >= 0)) return -ENOSPC; percpu_counter_inc(&ep->user->epoll_watches); if (!(epi = kmem_cache_zalloc(epi_cache, GFP_KERNEL))) { percpu_counter_dec(&ep->user->epoll_watches); return -ENOMEM; } /* Item initialization follow here ... */ INIT_LIST_HEAD(&epi->rdllink); epi->ep = ep; ep_set_ffd(&epi->ffd, tfile, fd); epi->event = *event; epi->next = EP_UNACTIVE_PTR; if (tep) mutex_lock_nested(&tep->mtx, 1); /* Add the current item to the list of active epoll hook for this file */ if (unlikely(attach_epitem(tfile, epi) < 0)) { if (tep) mutex_unlock(&tep->mtx); kmem_cache_free(epi_cache, epi); percpu_counter_dec(&ep->user->epoll_watches); return -ENOMEM; } if (full_check && !tep) list_file(tfile); /* * Add the current item to the RB tree. All RB tree operations are * protected by "mtx", and ep_insert() is called with "mtx" held. */ ep_rbtree_insert(ep, epi); if (tep) mutex_unlock(&tep->mtx); /* now check if we've created too many backpaths */ if (unlikely(full_check && reverse_path_check())) { ep_remove(ep, epi); return -EINVAL; } if (epi->event.events & EPOLLWAKEUP) { error = ep_create_wakeup_source(epi); if (error) { ep_remove(ep, epi); return error; } } /* Initialize the poll table using the queue callback */ epq.epi = epi; init_poll_funcptr(&epq.pt, ep_ptable_queue_proc); /* * Attach the item to the poll hooks and get current event bits. * We can safely use the file* here because its usage count has * been increased by the caller of this function. Note that after * this operation completes, the poll callback can start hitting * the new item. */ revents = ep_item_poll(epi, &epq.pt, 1); /* * We have to check if something went wrong during the poll wait queue * install process. Namely an allocation for a wait queue failed due * high memory pressure. */ if (unlikely(!epq.epi)) { ep_remove(ep, epi); return -ENOMEM; } /* We have to drop the new item inside our item list to keep track of it */ write_lock_irq(&ep->lock); /* record NAPI ID of new item if present */ ep_set_busy_poll_napi_id(epi); /* If the file is already "ready" we drop it inside the ready list */ if (revents && !ep_is_linked(epi)) { list_add_tail(&epi->rdllink, &ep->rdllist); ep_pm_stay_awake(epi); /* Notify waiting tasks that events are available */ if (waitqueue_active(&ep->wq)) wake_up(&ep->wq); if (waitqueue_active(&ep->poll_wait)) pwake++; } write_unlock_irq(&ep->lock); /* We have to call this outside the lock */ if (pwake) ep_poll_safewake(ep, NULL, 0); return 0; } /* * Modify the interest event mask by dropping an event if the new mask * has a match in the current file status. Must be called with "mtx" held. */ static int ep_modify(struct eventpoll *ep, struct epitem *epi, const struct epoll_event *event) { int pwake = 0; poll_table pt; lockdep_assert_irqs_enabled(); init_poll_funcptr(&pt, NULL); /* * Set the new event interest mask before calling f_op->poll(); * otherwise we might miss an event that happens between the * f_op->poll() call and the new event set registering. 
*/ epi->event.events = event->events; /* need barrier below */ epi->event.data = event->data; /* protected by mtx */ if (epi->event.events & EPOLLWAKEUP) { if (!ep_has_wakeup_source(epi)) ep_create_wakeup_source(epi); } else if (ep_has_wakeup_source(epi)) { ep_destroy_wakeup_source(epi); } /* * The following barrier has two effects: * * 1) Flush epi changes above to other CPUs. This ensures * we do not miss events from ep_poll_callback if an * event occurs immediately after we call f_op->poll(). * We need this because we did not take ep->lock while * changing epi above (but ep_poll_callback does take * ep->lock). * * 2) We also need to ensure we do not miss _past_ events * when calling f_op->poll(). This barrier also * pairs with the barrier in wq_has_sleeper (see * comments for wq_has_sleeper). * * This barrier will now guarantee ep_poll_callback or f_op->poll * (or both) will notice the readiness of an item. */ smp_mb(); /* * Get current event bits. We can safely use the file* here because * its usage count has been increased by the caller of this function. * If the item is "hot" and it is not registered inside the ready * list, push it inside. */ if (ep_item_poll(epi, &pt, 1)) { write_lock_irq(&ep->lock); if (!ep_is_linked(epi)) { list_add_tail(&epi->rdllink, &ep->rdllist); ep_pm_stay_awake(epi); /* Notify waiting tasks that events are available */ if (waitqueue_active(&ep->wq)) wake_up(&ep->wq); if (waitqueue_active(&ep->poll_wait)) pwake++; } write_unlock_irq(&ep->lock); } /* We have to call this outside the lock */ if (pwake) ep_poll_safewake(ep, NULL, 0); return 0; } static int ep_send_events(struct eventpoll *ep, struct epoll_event __user *events, int maxevents) { struct epitem *epi, *tmp; LIST_HEAD(txlist); poll_table pt; int res = 0; /* * Always short-circuit for fatal signals to allow threads to make a * timely exit without the chance of finding more events available and * fetching repeatedly. */ if (fatal_signal_pending(current)) return -EINTR; init_poll_funcptr(&pt, NULL); mutex_lock(&ep->mtx); ep_start_scan(ep, &txlist); /* * We can loop without lock because we are passed a task private list. * Items cannot vanish during the loop we are holding ep->mtx. */ list_for_each_entry_safe(epi, tmp, &txlist, rdllink) { struct wakeup_source *ws; __poll_t revents; if (res >= maxevents) break; /* * Activate ep->ws before deactivating epi->ws to prevent * triggering auto-suspend here (in case we reactive epi->ws * below). * * This could be rearranged to delay the deactivation of epi->ws * instead, but then epi->ws would temporarily be out of sync * with ep_is_linked(). */ ws = ep_wakeup_source(epi); if (ws) { if (ws->active) __pm_stay_awake(ep->ws); __pm_relax(ws); } list_del_init(&epi->rdllink); /* * If the event mask intersect the caller-requested one, * deliver the event to userspace. Again, we are holding ep->mtx, * so no operations coming from userspace can change the item. */ revents = ep_item_poll(epi, &pt, 1); if (!revents) continue; events = epoll_put_uevent(revents, epi->event.data, events); if (!events) { list_add(&epi->rdllink, &txlist); ep_pm_stay_awake(epi); if (!res) res = -EFAULT; break; } res++; if (epi->event.events & EPOLLONESHOT) epi->event.events &= EP_PRIVATE_BITS; else if (!(epi->event.events & EPOLLET)) { /* * If this file has been added with Level * Trigger mode, we need to insert back inside * the ready list, so that the next call to * epoll_wait() will check again the events * availability. At this point, no one can insert * into ep->rdllist besides us. 
The epoll_ctl() * callers are locked out by * ep_scan_ready_list() holding "mtx" and the * poll callback will queue them in ep->ovflist. */ list_add_tail(&epi->rdllink, &ep->rdllist); ep_pm_stay_awake(epi); } } ep_done_scan(ep, &txlist); mutex_unlock(&ep->mtx); return res; } static struct timespec64 *ep_timeout_to_timespec(struct timespec64 *to, long ms) { struct timespec64 now; if (ms < 0) return NULL; if (!ms) { to->tv_sec = 0; to->tv_nsec = 0; return to; } to->tv_sec = ms / MSEC_PER_SEC; to->tv_nsec = NSEC_PER_MSEC * (ms % MSEC_PER_SEC); ktime_get_ts64(&now); *to = timespec64_add_safe(now, *to); return to; } /* * autoremove_wake_function, but remove even on failure to wake up, because we * know that default_wake_function/ttwu will only fail if the thread is already * woken, and in that case the ep_poll loop will remove the entry anyways, not * try to reuse it. */ static int ep_autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned int mode, int sync, void *key) { int ret = default_wake_function(wq_entry, mode, sync, key); /* * Pairs with list_empty_careful in ep_poll, and ensures future loop * iterations see the cause of this wakeup. */ list_del_init_careful(&wq_entry->entry); return ret; } /** * ep_poll - Retrieves ready events, and delivers them to the caller-supplied * event buffer. * * @ep: Pointer to the eventpoll context. * @events: Pointer to the userspace buffer where the ready events should be * stored. * @maxevents: Size (in terms of number of events) of the caller event buffer. * @timeout: Maximum timeout for the ready events fetch operation, in * timespec. If the timeout is zero, the function will not block, * while if the @timeout ptr is NULL, the function will block * until at least one event has been retrieved (or an error * occurred). * * Return: the number of ready events which have been fetched, or an * error code, in case of error. */ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, int maxevents, struct timespec64 *timeout) { int res, eavail, timed_out = 0; u64 slack = 0; wait_queue_entry_t wait; ktime_t expires, *to = NULL; lockdep_assert_irqs_enabled(); if (timeout && (timeout->tv_sec | timeout->tv_nsec)) { slack = select_estimate_accuracy(timeout); to = &expires; *to = timespec64_to_ktime(*timeout); } else if (timeout) { /* * Avoid the unnecessary trip to the wait queue loop, if the * caller specified a non blocking operation. */ timed_out = 1; } /* * This call is racy: We may or may not see events that are being added * to the ready list under the lock (e.g., in IRQ callbacks). For cases * with a non-zero timeout, this thread will check the ready list under * lock and will add to the wait queue. For cases with a zero * timeout, the user by definition should not care and will have to * recheck again. */ eavail = ep_events_available(ep); while (1) { if (eavail) { /* * Try to transfer events to user space. In case we get * 0 events and there's still timeout left over, we go * trying again in search of more luck. */ res = ep_send_events(ep, events, maxevents); if (res) return res; } if (timed_out) return 0; eavail = ep_busy_loop(ep, timed_out); if (eavail) continue; if (signal_pending(current)) return -EINTR; /* * Internally init_wait() uses autoremove_wake_function(), * thus wait entry is removed from the wait queue on each * wakeup. Why it is important? In case of several waiters * each new wakeup will hit the next waiter, giving it the * chance to harvest new event. Otherwise wakeup can be * lost. 
This is also good performance-wise, because on * normal wakeup path no need to call __remove_wait_queue() * explicitly, thus ep->lock is not taken, which halts the * event delivery. * * In fact, we now use an even more aggressive function that * unconditionally removes, because we don't reuse the wait * entry between loop iterations. This lets us also avoid the * performance issue if a process is killed, causing all of its * threads to wake up without being removed normally. */ init_wait(&wait); wait.func = ep_autoremove_wake_function; write_lock_irq(&ep->lock); /* * Barrierless variant, waitqueue_active() is called under * the same lock on wakeup ep_poll_callback() side, so it * is safe to avoid an explicit barrier. */ __set_current_state(TASK_INTERRUPTIBLE); /* * Do the final check under the lock. ep_scan_ready_list() * plays with two lists (->rdllist and ->ovflist) and there * is always a race when both lists are empty for short * period of time although events are pending, so lock is * important. */ eavail = ep_events_available(ep); if (!eavail) __add_wait_queue_exclusive(&ep->wq, &wait); write_unlock_irq(&ep->lock); if (!eavail) timed_out = !schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS); __set_current_state(TASK_RUNNING); /* * We were woken up, thus go and try to harvest some events. * If timed out and still on the wait queue, recheck eavail * carefully under lock, below. */ eavail = 1; if (!list_empty_careful(&wait.entry)) { write_lock_irq(&ep->lock); /* * If the thread timed out and is not on the wait queue, * it means that the thread was woken up after its * timeout expired before it could reacquire the lock. * Thus, when wait.entry is empty, it needs to harvest * events. */ if (timed_out) eavail = list_empty(&wait.entry); __remove_wait_queue(&ep->wq, &wait); write_unlock_irq(&ep->lock); } } } /** * ep_loop_check_proc - verify that adding an epoll file inside another * epoll structure does not violate the constraints, in * terms of closed loops, or too deep chains (which can * result in excessive stack usage). * * @ep: the &struct eventpoll to be currently checked. * @depth: Current depth of the path being checked. * * Return: %zero if adding the epoll @file inside current epoll * structure @ep does not violate the constraints, or %-1 otherwise. */ static int ep_loop_check_proc(struct eventpoll *ep, int depth) { int error = 0; struct rb_node *rbp; struct epitem *epi; mutex_lock_nested(&ep->mtx, depth + 1); ep->gen = loop_check_gen; for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) { epi = rb_entry(rbp, struct epitem, rbn); if (unlikely(is_file_epoll(epi->ffd.file))) { struct eventpoll *ep_tovisit; ep_tovisit = epi->ffd.file->private_data; if (ep_tovisit->gen == loop_check_gen) continue; if (ep_tovisit == inserting_into || depth > EP_MAX_NESTS) error = -1; else error = ep_loop_check_proc(ep_tovisit, depth + 1); if (error != 0) break; } else { /* * If we've reached a file that is not associated with * an ep, then we need to check if the newly added * links are going to add too many wakeup paths. We do * this by adding it to the tfile_check_list, if it's * not already there, and calling reverse_path_check() * during ep_insert(). */ list_file(epi->ffd.file); } } mutex_unlock(&ep->mtx); return error; } /** * ep_loop_check - Performs a check to verify that adding an epoll file (@to) * into another epoll file (represented by @ep) does not create * closed loops or too deep chains. * * @ep: Pointer to the epoll we are inserting into. 
* @to: Pointer to the epoll to be inserted. * * Return: %zero if adding the epoll @to inside the epoll @from * does not violate the constraints, or %-1 otherwise. */ static int ep_loop_check(struct eventpoll *ep, struct eventpoll *to) { inserting_into = ep; return ep_loop_check_proc(to, 0); } static void clear_tfile_check_list(void) { rcu_read_lock(); while (tfile_check_list != EP_UNACTIVE_PTR) { struct epitems_head *head = tfile_check_list; tfile_check_list = head->next; unlist_file(head); } rcu_read_unlock(); } /* * Open an eventpoll file descriptor. */ static int do_epoll_create(int flags) { int error, fd; struct eventpoll *ep = NULL; struct file *file; /* Check the EPOLL_* constant for consistency. */ BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC); if (flags & ~EPOLL_CLOEXEC) return -EINVAL; /* * Create the internal data structure ("struct eventpoll"). */ error = ep_alloc(&ep); if (error < 0) return error; /* * Creates all the items needed to setup an eventpoll file. That is, * a file structure and a free file descriptor. */ fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC)); if (fd < 0) { error = fd; goto out_free_ep; } file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep, O_RDWR | (flags & O_CLOEXEC)); if (IS_ERR(file)) { error = PTR_ERR(file); goto out_free_fd; } ep->file = file; fd_install(fd, file); return fd; out_free_fd: put_unused_fd(fd); out_free_ep: ep_free(ep); return error; } SYSCALL_DEFINE1(epoll_create1, int, flags) { return do_epoll_create(flags); } SYSCALL_DEFINE1(epoll_create, int, size) { if (size <= 0) return -EINVAL; return do_epoll_create(0); } static inline int epoll_mutex_lock(struct mutex *mutex, int depth, bool nonblock) { if (!nonblock) { mutex_lock_nested(mutex, depth); return 0; } if (mutex_trylock(mutex)) return 0; return -EAGAIN; } int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds, bool nonblock) { int error; int full_check = 0; struct fd f, tf; struct eventpoll *ep; struct epitem *epi; struct eventpoll *tep = NULL; error = -EBADF; f = fdget(epfd); if (!f.file) goto error_return; /* Get the "struct file *" for the target file */ tf = fdget(fd); if (!tf.file) goto error_fput; /* The target file descriptor must support poll */ error = -EPERM; if (!file_can_poll(tf.file)) goto error_tgt_fput; /* Check if EPOLLWAKEUP is allowed */ if (ep_op_has_event(op)) ep_take_care_of_epollwakeup(epds); /* * We have to check that the file structure underneath the file descriptor * the user passed to us _is_ an eventpoll file. And also we do not permit * adding an epoll file descriptor inside itself. */ error = -EINVAL; if (f.file == tf.file || !is_file_epoll(f.file)) goto error_tgt_fput; /* * epoll adds to the wakeup queue at EPOLL_CTL_ADD time only, * so EPOLLEXCLUSIVE is not allowed for a EPOLL_CTL_MOD operation. * Also, we do not currently supported nested exclusive wakeups. */ if (ep_op_has_event(op) && (epds->events & EPOLLEXCLUSIVE)) { if (op == EPOLL_CTL_MOD) goto error_tgt_fput; if (op == EPOLL_CTL_ADD && (is_file_epoll(tf.file) || (epds->events & ~EPOLLEXCLUSIVE_OK_BITS))) goto error_tgt_fput; } /* * At this point it is safe to assume that the "private_data" contains * our own data structure. */ ep = f.file->private_data; /* * When we insert an epoll file descriptor inside another epoll file * descriptor, there is the chance of creating closed loops, which are * better be handled here, than in more critical paths. 
While we are * checking for loops we also determine the list of files reachable * and hang them on the tfile_check_list, so we can check that we * haven't created too many possible wakeup paths. * * We do not need to take the global 'epumutex' on EPOLL_CTL_ADD when * the epoll file descriptor is attaching directly to a wakeup source, * unless the epoll file descriptor is nested. The purpose of taking the * 'epmutex' on add is to prevent complex toplogies such as loops and * deep wakeup paths from forming in parallel through multiple * EPOLL_CTL_ADD operations. */ error = epoll_mutex_lock(&ep->mtx, 0, nonblock); if (error) goto error_tgt_fput; if (op == EPOLL_CTL_ADD) { if (READ_ONCE(f.file->f_ep) || ep->gen == loop_check_gen || is_file_epoll(tf.file)) { mutex_unlock(&ep->mtx); error = epoll_mutex_lock(&epmutex, 0, nonblock); if (error) goto error_tgt_fput; loop_check_gen++; full_check = 1; if (is_file_epoll(tf.file)) { tep = tf.file->private_data; error = -ELOOP; if (ep_loop_check(ep, tep) != 0) goto error_tgt_fput; } error = epoll_mutex_lock(&ep->mtx, 0, nonblock); if (error) goto error_tgt_fput; } } /* * Try to lookup the file inside our RB tree. Since we grabbed "mtx" * above, we can be sure to be able to use the item looked up by * ep_find() till we release the mutex. */ epi = ep_find(ep, tf.file, fd); error = -EINVAL; switch (op) { case EPOLL_CTL_ADD: if (!epi) { epds->events |= EPOLLERR | EPOLLHUP; error = ep_insert(ep, epds, tf.file, fd, full_check); } else error = -EEXIST; break; case EPOLL_CTL_DEL: if (epi) error = ep_remove(ep, epi); else error = -ENOENT; break; case EPOLL_CTL_MOD: if (epi) { if (!(epi->event.events & EPOLLEXCLUSIVE)) { epds->events |= EPOLLERR | EPOLLHUP; error = ep_modify(ep, epi, epds); } } else error = -ENOENT; break; } mutex_unlock(&ep->mtx); error_tgt_fput: if (full_check) { clear_tfile_check_list(); loop_check_gen++; mutex_unlock(&epmutex); } fdput(tf); error_fput: fdput(f); error_return: return error; } /* * The following function implements the controller interface for * the eventpoll file that enables the insertion/removal/change of * file descriptors inside the interest set. */ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, struct epoll_event __user *, event) { struct epoll_event epds; if (ep_op_has_event(op) && copy_from_user(&epds, event, sizeof(struct epoll_event))) return -EFAULT; return do_epoll_ctl(epfd, op, fd, &epds, false); } /* * Implement the event wait interface for the eventpoll file. It is the kernel * part of the user space epoll_wait(2). */ static int do_epoll_wait(int epfd, struct epoll_event __user *events, int maxevents, struct timespec64 *to) { int error; struct fd f; struct eventpoll *ep; /* The maximum number of event must be greater than zero */ if (maxevents <= 0 || maxevents > EP_MAX_EVENTS) return -EINVAL; /* Verify that the area passed by the user is writeable */ if (!access_ok(events, maxevents * sizeof(struct epoll_event))) return -EFAULT; /* Get the "struct file *" for the eventpoll file */ f = fdget(epfd); if (!f.file) return -EBADF; /* * We have to check that the file structure underneath the fd * the user passed to us _is_ an eventpoll file. */ error = -EINVAL; if (!is_file_epoll(f.file)) goto error_fput; /* * At this point it is safe to assume that the "private_data" contains * our own data structure. */ ep = f.file->private_data; /* Time to fish for events ... 
*/ error = ep_poll(ep, events, maxevents, to); error_fput: fdput(f); return error; } SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events, int, maxevents, int, timeout) { struct timespec64 to; return do_epoll_wait(epfd, events, maxevents, ep_timeout_to_timespec(&to, timeout)); } /* * Implement the event wait interface for the eventpoll file. It is the kernel * part of the user space epoll_pwait(2). */ static int do_epoll_pwait(int epfd, struct epoll_event __user *events, int maxevents, struct timespec64 *to, const sigset_t __user *sigmask, size_t sigsetsize) { int error; /* * If the caller wants a certain signal mask to be set during the wait, * we apply it here. */ error = set_user_sigmask(sigmask, sigsetsize); if (error) return error; error = do_epoll_wait(epfd, events, maxevents, to); restore_saved_sigmask_unless(error == -EINTR); return error; } SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events, int, maxevents, int, timeout, const sigset_t __user *, sigmask, size_t, sigsetsize) { struct timespec64 to; return do_epoll_pwait(epfd, events, maxevents, ep_timeout_to_timespec(&to, timeout), sigmask, sigsetsize); } SYSCALL_DEFINE6(epoll_pwait2, int, epfd, struct epoll_event __user *, events, int, maxevents, const struct __kernel_timespec __user *, timeout, const sigset_t __user *, sigmask, size_t, sigsetsize) { struct timespec64 ts, *to = NULL; if (timeout) { if (get_timespec64(&ts, timeout)) return -EFAULT; to = &ts; if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) return -EINVAL; } return do_epoll_pwait(epfd, events, maxevents, to, sigmask, sigsetsize); } #ifdef CONFIG_COMPAT static int do_compat_epoll_pwait(int epfd, struct epoll_event __user *events, int maxevents, struct timespec64 *timeout, const compat_sigset_t __user *sigmask, compat_size_t sigsetsize) { long err; /* * If the caller wants a certain signal mask to be set during the wait, * we apply it here. */ err = set_compat_user_sigmask(sigmask, sigsetsize); if (err) return err; err = do_epoll_wait(epfd, events, maxevents, timeout); restore_saved_sigmask_unless(err == -EINTR); return err; } COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events, int, maxevents, int, timeout, const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize) { struct timespec64 to; return do_compat_epoll_pwait(epfd, events, maxevents, ep_timeout_to_timespec(&to, timeout), sigmask, sigsetsize); } COMPAT_SYSCALL_DEFINE6(epoll_pwait2, int, epfd, struct epoll_event __user *, events, int, maxevents, const struct __kernel_timespec __user *, timeout, const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize) { struct timespec64 ts, *to = NULL; if (timeout) { if (get_timespec64(&ts, timeout)) return -EFAULT; to = &ts; if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) return -EINVAL; } return do_compat_epoll_pwait(epfd, events, maxevents, to, sigmask, sigsetsize); } #endif static int __init eventpoll_init(void) { struct sysinfo si; si_meminfo(&si); /* * Allows top 4% of lomem to be allocated for epoll watches (per user). 
 */
	max_user_watches = (((si.totalram - si.totalhigh) / 25) << PAGE_SHIFT) /
			   EP_ITEM_COST;
	BUG_ON(max_user_watches < 0);

	/*
	 * We can have many thousands of epitems, so prevent this from
	 * using an extra cache line on 64-bit (and smaller) CPUs
	 */
	BUILD_BUG_ON(sizeof(void *) <= 8 && sizeof(struct epitem) > 128);

	/* Allocates slab cache used to allocate "struct epitem" items */
	epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem),
			0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL);

	/* Allocates slab cache used to allocate "struct eppoll_entry" */
	pwq_cache = kmem_cache_create("eventpoll_pwq",
			sizeof(struct eppoll_entry), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL);

	ephead_cache = kmem_cache_create("ep_head",
			sizeof(struct epitems_head), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL);

	return 0;
}
fs_initcall(eventpoll_init);
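The syscall definitions above (epoll_create1/epoll_create, epoll_ctl, epoll_wait and the pwait variants) are the kernel side of the familiar user-space API. As a quick orientation for how those entry points get exercised, here is a minimal, illustrative user-space sketch; the hypothetical listen_fd descriptor, the buffer size, and the error handling are assumptions made for the example and are not part of the kernel source.

/* Illustrative user-space sketch only; listen_fd is assumed to be an
 * already-created, listening socket supplied by the caller. */
#include <stdio.h>
#include <stdlib.h>
#include <sys/epoll.h>

#define MAX_EVENTS 16

static void wait_for_events(int listen_fd)
{
	struct epoll_event ev, events[MAX_EVENTS];
	int epfd, nfds, i;

	epfd = epoll_create1(EPOLL_CLOEXEC);		/* -> do_epoll_create() */
	if (epfd < 0) {
		perror("epoll_create1");
		exit(EXIT_FAILURE);
	}

	ev.events = EPOLLIN;				/* level-triggered readability */
	ev.data.fd = listen_fd;
	if (epoll_ctl(epfd, EPOLL_CTL_ADD, listen_fd, &ev) < 0) {	/* -> do_epoll_ctl()/ep_insert() */
		perror("epoll_ctl");
		exit(EXIT_FAILURE);
	}

	for (;;) {
		nfds = epoll_wait(epfd, events, MAX_EVENTS, -1);	/* -> do_epoll_wait()/ep_poll() */
		if (nfds < 0) {
			perror("epoll_wait");
			exit(EXIT_FAILURE);
		}
		for (i = 0; i < nfds; i++)
			printf("fd %d ready, events 0x%x\n",
			       events[i].data.fd, events[i].events);
	}
}

Because listen_fd is added without EPOLLET, ep_send_events() re-queues it on ep->rdllist after each delivery (the level-trigger branch shown earlier), so the loop above keeps seeing the descriptor until its readiness condition is drained.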
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __VDSO_MATH64_H
#define __VDSO_MATH64_H

static __always_inline u32
__iter_div_u64_rem(u64 dividend, u32 divisor, u64 *remainder)
{
	u32 ret = 0;

	while (dividend >= divisor) {
		/* The following asm() prevents the compiler from
		   optimising this loop into a modulo operation. */
		asm("" : "+rm"(dividend));

		dividend -= divisor;
		ret++;
	}

	*remainder = dividend;

	return ret;
}

#endif /* __VDSO_MATH64_H */
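__iter_div_u64_rem() above trades a hardware divide for repeated subtraction, so it is only suitable when the caller knows the quotient is small, typically when splitting a nanosecond count into whole seconds plus a remainder. A minimal sketch of that call pattern follows; the wrapper name and the sample value are assumptions made purely for illustration.

/*
 * Illustrative only: the kernel u64/u32 types are assumed from the
 * surrounding headers, and the example value is made up.
 */
static inline u32 example_ns_to_secs(u64 ns, u64 *rem_ns)
{
	/* For ns around 3.7e9 the subtraction loop iterates just 3 times,
	 * returning 3 with *rem_ns left holding the sub-second part. */
	return __iter_div_u64_rem(ns, 1000000000U, rem_ns);
}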
// SPDX-License-Identifier: GPL-2.0

// Generated by scripts/atomic/gen-atomic-instrumented.sh
// DO NOT MODIFY THIS FILE DIRECTLY

/*
 * This file provides wrappers with KASAN instrumentation for atomic operations.
 * To use this functionality an arch's atomic.h file needs to define all
 * atomic operations with arch_ prefix (e.g. arch_atomic_read()) and include
 * this file at the end. This file provides atomic_read() that forwards to
 * arch_atomic_read() for actual atomic operation.
 * Note: if an arch atomic operation is implemented by means of other atomic
 * operations (e.g. atomic_read()/atomic_cmpxchg() loop), then it needs to use
 * arch_ variants (i.e. arch_atomic_read()/arch_atomic_cmpxchg()) to avoid
 * double instrumentation.
*/ #ifndef _LINUX_ATOMIC_INSTRUMENTED_H #define _LINUX_ATOMIC_INSTRUMENTED_H #include <linux/build_bug.h> #include <linux/compiler.h> #include <linux/instrumented.h> static __always_inline int atomic_read(const atomic_t *v) { instrument_atomic_read(v, sizeof(*v)); return arch_atomic_read(v); } static __always_inline int atomic_read_acquire(const atomic_t *v) { instrument_atomic_read(v, sizeof(*v)); return arch_atomic_read_acquire(v); } static __always_inline void atomic_set(atomic_t *v, int i) { instrument_atomic_write(v, sizeof(*v)); arch_atomic_set(v, i); } static __always_inline void atomic_set_release(atomic_t *v, int i) { instrument_atomic_write(v, sizeof(*v)); arch_atomic_set_release(v, i); } static __always_inline void atomic_add(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); arch_atomic_add(i, v); } static __always_inline int atomic_add_return(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_add_return(i, v); } static __always_inline int atomic_add_return_acquire(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_add_return_acquire(i, v); } static __always_inline int atomic_add_return_release(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_add_return_release(i, v); } static __always_inline int atomic_add_return_relaxed(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_add_return_relaxed(i, v); } static __always_inline int atomic_fetch_add(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_fetch_add(i, v); } static __always_inline int atomic_fetch_add_acquire(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_fetch_add_acquire(i, v); } static __always_inline int atomic_fetch_add_release(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_fetch_add_release(i, v); } static __always_inline int atomic_fetch_add_relaxed(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_fetch_add_relaxed(i, v); } static __always_inline void atomic_sub(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); arch_atomic_sub(i, v); } static __always_inline int atomic_sub_return(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_sub_return(i, v); } static __always_inline int atomic_sub_return_acquire(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_sub_return_acquire(i, v); } static __always_inline int atomic_sub_return_release(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_sub_return_release(i, v); } static __always_inline int atomic_sub_return_relaxed(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_sub_return_relaxed(i, v); } static __always_inline int atomic_fetch_sub(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_fetch_sub(i, v); } static __always_inline int atomic_fetch_sub_acquire(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_fetch_sub_acquire(i, v); } static __always_inline int atomic_fetch_sub_release(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_fetch_sub_release(i, v); } static __always_inline int atomic_fetch_sub_relaxed(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return 
arch_atomic_fetch_sub_relaxed(i, v); } static __always_inline void atomic_inc(atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); arch_atomic_inc(v); } static __always_inline int atomic_inc_return(atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_inc_return(v); } static __always_inline int atomic_inc_return_acquire(atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_inc_return_acquire(v); } static __always_inline int atomic_inc_return_release(atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_inc_return_release(v); } static __always_inline int atomic_inc_return_relaxed(atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_inc_return_relaxed(v); } static __always_inline int atomic_fetch_inc(atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_fetch_inc(v); } static __always_inline int atomic_fetch_inc_acquire(atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_fetch_inc_acquire(v); } static __always_inline int atomic_fetch_inc_release(atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_fetch_inc_release(v); } static __always_inline int atomic_fetch_inc_relaxed(atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_fetch_inc_relaxed(v); } static __always_inline void atomic_dec(atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); arch_atomic_dec(v); } static __always_inline int atomic_dec_return(atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_dec_return(v); } static __always_inline int atomic_dec_return_acquire(atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_dec_return_acquire(v); } static __always_inline int atomic_dec_return_release(atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_dec_return_release(v); } static __always_inline int atomic_dec_return_relaxed(atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_dec_return_relaxed(v); } static __always_inline int atomic_fetch_dec(atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_fetch_dec(v); } static __always_inline int atomic_fetch_dec_acquire(atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_fetch_dec_acquire(v); } static __always_inline int atomic_fetch_dec_release(atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_fetch_dec_release(v); } static __always_inline int atomic_fetch_dec_relaxed(atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_fetch_dec_relaxed(v); } static __always_inline void atomic_and(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); arch_atomic_and(i, v); } static __always_inline int atomic_fetch_and(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_fetch_and(i, v); } static __always_inline int atomic_fetch_and_acquire(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_fetch_and_acquire(i, v); } static __always_inline int atomic_fetch_and_release(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_fetch_and_release(i, v); } static __always_inline int atomic_fetch_and_relaxed(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_fetch_and_relaxed(i, v); } static __always_inline void atomic_andnot(int i, atomic_t *v) { 
instrument_atomic_read_write(v, sizeof(*v)); arch_atomic_andnot(i, v); } static __always_inline int atomic_fetch_andnot(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_fetch_andnot(i, v); } static __always_inline int atomic_fetch_andnot_acquire(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_fetch_andnot_acquire(i, v); } static __always_inline int atomic_fetch_andnot_release(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_fetch_andnot_release(i, v); } static __always_inline int atomic_fetch_andnot_relaxed(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_fetch_andnot_relaxed(i, v); } static __always_inline void atomic_or(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); arch_atomic_or(i, v); } static __always_inline int atomic_fetch_or(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_fetch_or(i, v); } static __always_inline int atomic_fetch_or_acquire(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_fetch_or_acquire(i, v); } static __always_inline int atomic_fetch_or_release(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_fetch_or_release(i, v); } static __always_inline int atomic_fetch_or_relaxed(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_fetch_or_relaxed(i, v); } static __always_inline void atomic_xor(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); arch_atomic_xor(i, v); } static __always_inline int atomic_fetch_xor(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_fetch_xor(i, v); } static __always_inline int atomic_fetch_xor_acquire(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_fetch_xor_acquire(i, v); } static __always_inline int atomic_fetch_xor_release(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_fetch_xor_release(i, v); } static __always_inline int atomic_fetch_xor_relaxed(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_fetch_xor_relaxed(i, v); } static __always_inline int atomic_xchg(atomic_t *v, int i) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_xchg(v, i); } static __always_inline int atomic_xchg_acquire(atomic_t *v, int i) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_xchg_acquire(v, i); } static __always_inline int atomic_xchg_release(atomic_t *v, int i) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_xchg_release(v, i); } static __always_inline int atomic_xchg_relaxed(atomic_t *v, int i) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_xchg_relaxed(v, i); } static __always_inline int atomic_cmpxchg(atomic_t *v, int old, int new) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_cmpxchg(v, old, new); } static __always_inline int atomic_cmpxchg_acquire(atomic_t *v, int old, int new) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_cmpxchg_acquire(v, old, new); } static __always_inline int atomic_cmpxchg_release(atomic_t *v, int old, int new) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_cmpxchg_release(v, old, new); } static __always_inline int atomic_cmpxchg_relaxed(atomic_t *v, int old, int new) { instrument_atomic_read_write(v, sizeof(*v)); return 
arch_atomic_cmpxchg_relaxed(v, old, new); } static __always_inline bool atomic_try_cmpxchg(atomic_t *v, int *old, int new) { instrument_atomic_read_write(v, sizeof(*v)); instrument_atomic_read_write(old, sizeof(*old)); return arch_atomic_try_cmpxchg(v, old, new); } static __always_inline bool atomic_try_cmpxchg_acquire(atomic_t *v, int *old, int new) { instrument_atomic_read_write(v, sizeof(*v)); instrument_atomic_read_write(old, sizeof(*old)); return arch_atomic_try_cmpxchg_acquire(v, old, new); } static __always_inline bool atomic_try_cmpxchg_release(atomic_t *v, int *old, int new) { instrument_atomic_read_write(v, sizeof(*v)); instrument_atomic_read_write(old, sizeof(*old)); return arch_atomic_try_cmpxchg_release(v, old, new); } static __always_inline bool atomic_try_cmpxchg_relaxed(atomic_t *v, int *old, int new) { instrument_atomic_read_write(v, sizeof(*v)); instrument_atomic_read_write(old, sizeof(*old)); return arch_atomic_try_cmpxchg_relaxed(v, old, new); } static __always_inline bool atomic_sub_and_test(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_sub_and_test(i, v); } static __always_inline bool atomic_dec_and_test(atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_dec_and_test(v); } static __always_inline bool atomic_inc_and_test(atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_inc_and_test(v); } static __always_inline bool atomic_add_negative(int i, atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_add_negative(i, v); } static __always_inline int atomic_fetch_add_unless(atomic_t *v, int a, int u) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_fetch_add_unless(v, a, u); } static __always_inline bool atomic_add_unless(atomic_t *v, int a, int u) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_add_unless(v, a, u); } static __always_inline bool atomic_inc_not_zero(atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_inc_not_zero(v); } static __always_inline bool atomic_inc_unless_negative(atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_inc_unless_negative(v); } static __always_inline bool atomic_dec_unless_positive(atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_dec_unless_positive(v); } static __always_inline int atomic_dec_if_positive(atomic_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_dec_if_positive(v); } static __always_inline s64 atomic64_read(const atomic64_t *v) { instrument_atomic_read(v, sizeof(*v)); return arch_atomic64_read(v); } static __always_inline s64 atomic64_read_acquire(const atomic64_t *v) { instrument_atomic_read(v, sizeof(*v)); return arch_atomic64_read_acquire(v); } static __always_inline void atomic64_set(atomic64_t *v, s64 i) { instrument_atomic_write(v, sizeof(*v)); arch_atomic64_set(v, i); } static __always_inline void atomic64_set_release(atomic64_t *v, s64 i) { instrument_atomic_write(v, sizeof(*v)); arch_atomic64_set_release(v, i); } static __always_inline void atomic64_add(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); arch_atomic64_add(i, v); } static __always_inline s64 atomic64_add_return(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_add_return(i, v); } static __always_inline s64 atomic64_add_return_acquire(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return 
arch_atomic64_add_return_acquire(i, v); } static __always_inline s64 atomic64_add_return_release(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_add_return_release(i, v); } static __always_inline s64 atomic64_add_return_relaxed(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_add_return_relaxed(i, v); } static __always_inline s64 atomic64_fetch_add(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_fetch_add(i, v); } static __always_inline s64 atomic64_fetch_add_acquire(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_fetch_add_acquire(i, v); } static __always_inline s64 atomic64_fetch_add_release(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_fetch_add_release(i, v); } static __always_inline s64 atomic64_fetch_add_relaxed(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_fetch_add_relaxed(i, v); } static __always_inline void atomic64_sub(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); arch_atomic64_sub(i, v); } static __always_inline s64 atomic64_sub_return(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_sub_return(i, v); } static __always_inline s64 atomic64_sub_return_acquire(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_sub_return_acquire(i, v); } static __always_inline s64 atomic64_sub_return_release(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_sub_return_release(i, v); } static __always_inline s64 atomic64_sub_return_relaxed(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_sub_return_relaxed(i, v); } static __always_inline s64 atomic64_fetch_sub(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_fetch_sub(i, v); } static __always_inline s64 atomic64_fetch_sub_acquire(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_fetch_sub_acquire(i, v); } static __always_inline s64 atomic64_fetch_sub_release(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_fetch_sub_release(i, v); } static __always_inline s64 atomic64_fetch_sub_relaxed(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_fetch_sub_relaxed(i, v); } static __always_inline void atomic64_inc(atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); arch_atomic64_inc(v); } static __always_inline s64 atomic64_inc_return(atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_inc_return(v); } static __always_inline s64 atomic64_inc_return_acquire(atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_inc_return_acquire(v); } static __always_inline s64 atomic64_inc_return_release(atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_inc_return_release(v); } static __always_inline s64 atomic64_inc_return_relaxed(atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_inc_return_relaxed(v); } static __always_inline s64 atomic64_fetch_inc(atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_fetch_inc(v); } static __always_inline s64 atomic64_fetch_inc_acquire(atomic64_t *v) { instrument_atomic_read_write(v, 
sizeof(*v)); return arch_atomic64_fetch_inc_acquire(v); } static __always_inline s64 atomic64_fetch_inc_release(atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_fetch_inc_release(v); } static __always_inline s64 atomic64_fetch_inc_relaxed(atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_fetch_inc_relaxed(v); } static __always_inline void atomic64_dec(atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); arch_atomic64_dec(v); } static __always_inline s64 atomic64_dec_return(atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_dec_return(v); } static __always_inline s64 atomic64_dec_return_acquire(atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_dec_return_acquire(v); } static __always_inline s64 atomic64_dec_return_release(atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_dec_return_release(v); } static __always_inline s64 atomic64_dec_return_relaxed(atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_dec_return_relaxed(v); } static __always_inline s64 atomic64_fetch_dec(atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_fetch_dec(v); } static __always_inline s64 atomic64_fetch_dec_acquire(atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_fetch_dec_acquire(v); } static __always_inline s64 atomic64_fetch_dec_release(atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_fetch_dec_release(v); } static __always_inline s64 atomic64_fetch_dec_relaxed(atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_fetch_dec_relaxed(v); } static __always_inline void atomic64_and(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); arch_atomic64_and(i, v); } static __always_inline s64 atomic64_fetch_and(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_fetch_and(i, v); } static __always_inline s64 atomic64_fetch_and_acquire(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_fetch_and_acquire(i, v); } static __always_inline s64 atomic64_fetch_and_release(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_fetch_and_release(i, v); } static __always_inline s64 atomic64_fetch_and_relaxed(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_fetch_and_relaxed(i, v); } static __always_inline void atomic64_andnot(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); arch_atomic64_andnot(i, v); } static __always_inline s64 atomic64_fetch_andnot(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_fetch_andnot(i, v); } static __always_inline s64 atomic64_fetch_andnot_acquire(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_fetch_andnot_acquire(i, v); } static __always_inline s64 atomic64_fetch_andnot_release(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_fetch_andnot_release(i, v); } static __always_inline s64 atomic64_fetch_andnot_relaxed(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_fetch_andnot_relaxed(i, v); } static __always_inline void atomic64_or(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); arch_atomic64_or(i, v); } static 
__always_inline s64 atomic64_fetch_or(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_fetch_or(i, v); } static __always_inline s64 atomic64_fetch_or_acquire(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_fetch_or_acquire(i, v); } static __always_inline s64 atomic64_fetch_or_release(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_fetch_or_release(i, v); } static __always_inline s64 atomic64_fetch_or_relaxed(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_fetch_or_relaxed(i, v); } static __always_inline void atomic64_xor(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); arch_atomic64_xor(i, v); } static __always_inline s64 atomic64_fetch_xor(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_fetch_xor(i, v); } static __always_inline s64 atomic64_fetch_xor_acquire(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_fetch_xor_acquire(i, v); } static __always_inline s64 atomic64_fetch_xor_release(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_fetch_xor_release(i, v); } static __always_inline s64 atomic64_fetch_xor_relaxed(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_fetch_xor_relaxed(i, v); } static __always_inline s64 atomic64_xchg(atomic64_t *v, s64 i) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_xchg(v, i); } static __always_inline s64 atomic64_xchg_acquire(atomic64_t *v, s64 i) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_xchg_acquire(v, i); } static __always_inline s64 atomic64_xchg_release(atomic64_t *v, s64 i) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_xchg_release(v, i); } static __always_inline s64 atomic64_xchg_relaxed(atomic64_t *v, s64 i) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_xchg_relaxed(v, i); } static __always_inline s64 atomic64_cmpxchg(atomic64_t *v, s64 old, s64 new) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_cmpxchg(v, old, new); } static __always_inline s64 atomic64_cmpxchg_acquire(atomic64_t *v, s64 old, s64 new) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_cmpxchg_acquire(v, old, new); } static __always_inline s64 atomic64_cmpxchg_release(atomic64_t *v, s64 old, s64 new) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_cmpxchg_release(v, old, new); } static __always_inline s64 atomic64_cmpxchg_relaxed(atomic64_t *v, s64 old, s64 new) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_cmpxchg_relaxed(v, old, new); } static __always_inline bool atomic64_try_cmpxchg(atomic64_t *v, s64 *old, s64 new) { instrument_atomic_read_write(v, sizeof(*v)); instrument_atomic_read_write(old, sizeof(*old)); return arch_atomic64_try_cmpxchg(v, old, new); } static __always_inline bool atomic64_try_cmpxchg_acquire(atomic64_t *v, s64 *old, s64 new) { instrument_atomic_read_write(v, sizeof(*v)); instrument_atomic_read_write(old, sizeof(*old)); return arch_atomic64_try_cmpxchg_acquire(v, old, new); } static __always_inline bool atomic64_try_cmpxchg_release(atomic64_t *v, s64 *old, s64 new) { instrument_atomic_read_write(v, sizeof(*v)); instrument_atomic_read_write(old, sizeof(*old)); return arch_atomic64_try_cmpxchg_release(v, old, new); } static __always_inline bool 
atomic64_try_cmpxchg_relaxed(atomic64_t *v, s64 *old, s64 new) { instrument_atomic_read_write(v, sizeof(*v)); instrument_atomic_read_write(old, sizeof(*old)); return arch_atomic64_try_cmpxchg_relaxed(v, old, new); } static __always_inline bool atomic64_sub_and_test(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_sub_and_test(i, v); } static __always_inline bool atomic64_dec_and_test(atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_dec_and_test(v); } static __always_inline bool atomic64_inc_and_test(atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_inc_and_test(v); } static __always_inline bool atomic64_add_negative(s64 i, atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_add_negative(i, v); } static __always_inline s64 atomic64_fetch_add_unless(atomic64_t *v, s64 a, s64 u) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_fetch_add_unless(v, a, u); } static __always_inline bool atomic64_add_unless(atomic64_t *v, s64 a, s64 u) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_add_unless(v, a, u); } static __always_inline bool atomic64_inc_not_zero(atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_inc_not_zero(v); } static __always_inline bool atomic64_inc_unless_negative(atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_inc_unless_negative(v); } static __always_inline bool atomic64_dec_unless_positive(atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_dec_unless_positive(v); } static __always_inline s64 atomic64_dec_if_positive(atomic64_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic64_dec_if_positive(v); } static __always_inline long atomic_long_read(const atomic_long_t *v) { instrument_atomic_read(v, sizeof(*v)); return arch_atomic_long_read(v); } static __always_inline long atomic_long_read_acquire(const atomic_long_t *v) { instrument_atomic_read(v, sizeof(*v)); return arch_atomic_long_read_acquire(v); } static __always_inline void atomic_long_set(atomic_long_t *v, long i) { instrument_atomic_write(v, sizeof(*v)); arch_atomic_long_set(v, i); } static __always_inline void atomic_long_set_release(atomic_long_t *v, long i) { instrument_atomic_write(v, sizeof(*v)); arch_atomic_long_set_release(v, i); } static __always_inline void atomic_long_add(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); arch_atomic_long_add(i, v); } static __always_inline long atomic_long_add_return(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_add_return(i, v); } static __always_inline long atomic_long_add_return_acquire(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_add_return_acquire(i, v); } static __always_inline long atomic_long_add_return_release(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_add_return_release(i, v); } static __always_inline long atomic_long_add_return_relaxed(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_add_return_relaxed(i, v); } static __always_inline long atomic_long_fetch_add(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_fetch_add(i, v); } static __always_inline long atomic_long_fetch_add_acquire(long i, atomic_long_t *v) 
{ instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_fetch_add_acquire(i, v); } static __always_inline long atomic_long_fetch_add_release(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_fetch_add_release(i, v); } static __always_inline long atomic_long_fetch_add_relaxed(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_fetch_add_relaxed(i, v); } static __always_inline void atomic_long_sub(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); arch_atomic_long_sub(i, v); } static __always_inline long atomic_long_sub_return(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_sub_return(i, v); } static __always_inline long atomic_long_sub_return_acquire(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_sub_return_acquire(i, v); } static __always_inline long atomic_long_sub_return_release(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_sub_return_release(i, v); } static __always_inline long atomic_long_sub_return_relaxed(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_sub_return_relaxed(i, v); } static __always_inline long atomic_long_fetch_sub(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_fetch_sub(i, v); } static __always_inline long atomic_long_fetch_sub_acquire(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_fetch_sub_acquire(i, v); } static __always_inline long atomic_long_fetch_sub_release(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_fetch_sub_release(i, v); } static __always_inline long atomic_long_fetch_sub_relaxed(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_fetch_sub_relaxed(i, v); } static __always_inline void atomic_long_inc(atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); arch_atomic_long_inc(v); } static __always_inline long atomic_long_inc_return(atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_inc_return(v); } static __always_inline long atomic_long_inc_return_acquire(atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_inc_return_acquire(v); } static __always_inline long atomic_long_inc_return_release(atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_inc_return_release(v); } static __always_inline long atomic_long_inc_return_relaxed(atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_inc_return_relaxed(v); } static __always_inline long atomic_long_fetch_inc(atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_fetch_inc(v); } static __always_inline long atomic_long_fetch_inc_acquire(atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_fetch_inc_acquire(v); } static __always_inline long atomic_long_fetch_inc_release(atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_fetch_inc_release(v); } static __always_inline long atomic_long_fetch_inc_relaxed(atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_fetch_inc_relaxed(v); } static __always_inline void 
atomic_long_dec(atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); arch_atomic_long_dec(v); } static __always_inline long atomic_long_dec_return(atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_dec_return(v); } static __always_inline long atomic_long_dec_return_acquire(atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_dec_return_acquire(v); } static __always_inline long atomic_long_dec_return_release(atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_dec_return_release(v); } static __always_inline long atomic_long_dec_return_relaxed(atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_dec_return_relaxed(v); } static __always_inline long atomic_long_fetch_dec(atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_fetch_dec(v); } static __always_inline long atomic_long_fetch_dec_acquire(atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_fetch_dec_acquire(v); } static __always_inline long atomic_long_fetch_dec_release(atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_fetch_dec_release(v); } static __always_inline long atomic_long_fetch_dec_relaxed(atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_fetch_dec_relaxed(v); } static __always_inline void atomic_long_and(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); arch_atomic_long_and(i, v); } static __always_inline long atomic_long_fetch_and(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_fetch_and(i, v); } static __always_inline long atomic_long_fetch_and_acquire(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_fetch_and_acquire(i, v); } static __always_inline long atomic_long_fetch_and_release(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_fetch_and_release(i, v); } static __always_inline long atomic_long_fetch_and_relaxed(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_fetch_and_relaxed(i, v); } static __always_inline void atomic_long_andnot(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); arch_atomic_long_andnot(i, v); } static __always_inline long atomic_long_fetch_andnot(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_fetch_andnot(i, v); } static __always_inline long atomic_long_fetch_andnot_acquire(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_fetch_andnot_acquire(i, v); } static __always_inline long atomic_long_fetch_andnot_release(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_fetch_andnot_release(i, v); } static __always_inline long atomic_long_fetch_andnot_relaxed(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_fetch_andnot_relaxed(i, v); } static __always_inline void atomic_long_or(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); arch_atomic_long_or(i, v); } static __always_inline long atomic_long_fetch_or(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_fetch_or(i, v); } static __always_inline long 
atomic_long_fetch_or_acquire(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_fetch_or_acquire(i, v); } static __always_inline long atomic_long_fetch_or_release(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_fetch_or_release(i, v); } static __always_inline long atomic_long_fetch_or_relaxed(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_fetch_or_relaxed(i, v); } static __always_inline void atomic_long_xor(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); arch_atomic_long_xor(i, v); } static __always_inline long atomic_long_fetch_xor(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_fetch_xor(i, v); } static __always_inline long atomic_long_fetch_xor_acquire(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_fetch_xor_acquire(i, v); } static __always_inline long atomic_long_fetch_xor_release(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_fetch_xor_release(i, v); } static __always_inline long atomic_long_fetch_xor_relaxed(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_fetch_xor_relaxed(i, v); } static __always_inline long atomic_long_xchg(atomic_long_t *v, long i) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_xchg(v, i); } static __always_inline long atomic_long_xchg_acquire(atomic_long_t *v, long i) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_xchg_acquire(v, i); } static __always_inline long atomic_long_xchg_release(atomic_long_t *v, long i) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_xchg_release(v, i); } static __always_inline long atomic_long_xchg_relaxed(atomic_long_t *v, long i) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_xchg_relaxed(v, i); } static __always_inline long atomic_long_cmpxchg(atomic_long_t *v, long old, long new) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_cmpxchg(v, old, new); } static __always_inline long atomic_long_cmpxchg_acquire(atomic_long_t *v, long old, long new) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_cmpxchg_acquire(v, old, new); } static __always_inline long atomic_long_cmpxchg_release(atomic_long_t *v, long old, long new) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_cmpxchg_release(v, old, new); } static __always_inline long atomic_long_cmpxchg_relaxed(atomic_long_t *v, long old, long new) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_cmpxchg_relaxed(v, old, new); } static __always_inline bool atomic_long_try_cmpxchg(atomic_long_t *v, long *old, long new) { instrument_atomic_read_write(v, sizeof(*v)); instrument_atomic_read_write(old, sizeof(*old)); return arch_atomic_long_try_cmpxchg(v, old, new); } static __always_inline bool atomic_long_try_cmpxchg_acquire(atomic_long_t *v, long *old, long new) { instrument_atomic_read_write(v, sizeof(*v)); instrument_atomic_read_write(old, sizeof(*old)); return arch_atomic_long_try_cmpxchg_acquire(v, old, new); } static __always_inline bool atomic_long_try_cmpxchg_release(atomic_long_t *v, long *old, long new) { instrument_atomic_read_write(v, sizeof(*v)); instrument_atomic_read_write(old, sizeof(*old)); return arch_atomic_long_try_cmpxchg_release(v, 
old, new); } static __always_inline bool atomic_long_try_cmpxchg_relaxed(atomic_long_t *v, long *old, long new) { instrument_atomic_read_write(v, sizeof(*v)); instrument_atomic_read_write(old, sizeof(*old)); return arch_atomic_long_try_cmpxchg_relaxed(v, old, new); } static __always_inline bool atomic_long_sub_and_test(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_sub_and_test(i, v); } static __always_inline bool atomic_long_dec_and_test(atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_dec_and_test(v); } static __always_inline bool atomic_long_inc_and_test(atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_inc_and_test(v); } static __always_inline bool atomic_long_add_negative(long i, atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_add_negative(i, v); } static __always_inline long atomic_long_fetch_add_unless(atomic_long_t *v, long a, long u) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_fetch_add_unless(v, a, u); } static __always_inline bool atomic_long_add_unless(atomic_long_t *v, long a, long u) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_add_unless(v, a, u); } static __always_inline bool atomic_long_inc_not_zero(atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_inc_not_zero(v); } static __always_inline bool atomic_long_inc_unless_negative(atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_inc_unless_negative(v); } static __always_inline bool atomic_long_dec_unless_positive(atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_dec_unless_positive(v); } static __always_inline long atomic_long_dec_if_positive(atomic_long_t *v) { instrument_atomic_read_write(v, sizeof(*v)); return arch_atomic_long_dec_if_positive(v); } #define xchg(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \ arch_xchg(__ai_ptr, __VA_ARGS__); \ }) #define xchg_acquire(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \ arch_xchg_acquire(__ai_ptr, __VA_ARGS__); \ }) #define xchg_release(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \ arch_xchg_release(__ai_ptr, __VA_ARGS__); \ }) #define xchg_relaxed(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \ arch_xchg_relaxed(__ai_ptr, __VA_ARGS__); \ }) #define cmpxchg(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \ arch_cmpxchg(__ai_ptr, __VA_ARGS__); \ }) #define cmpxchg_acquire(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \ arch_cmpxchg_acquire(__ai_ptr, __VA_ARGS__); \ }) #define cmpxchg_release(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \ arch_cmpxchg_release(__ai_ptr, __VA_ARGS__); \ }) #define cmpxchg_relaxed(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \ arch_cmpxchg_relaxed(__ai_ptr, __VA_ARGS__); \ }) #define cmpxchg64(ptr, ...) 
\ ({ \ typeof(ptr) __ai_ptr = (ptr); \ instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \ arch_cmpxchg64(__ai_ptr, __VA_ARGS__); \ }) #define cmpxchg64_acquire(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \ arch_cmpxchg64_acquire(__ai_ptr, __VA_ARGS__); \ }) #define cmpxchg64_release(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \ arch_cmpxchg64_release(__ai_ptr, __VA_ARGS__); \ }) #define cmpxchg64_relaxed(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \ arch_cmpxchg64_relaxed(__ai_ptr, __VA_ARGS__); \ }) #define try_cmpxchg(ptr, oldp, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ typeof(oldp) __ai_oldp = (oldp); \ instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \ instrument_atomic_write(__ai_oldp, sizeof(*__ai_oldp)); \ arch_try_cmpxchg(__ai_ptr, __ai_oldp, __VA_ARGS__); \ }) #define try_cmpxchg_acquire(ptr, oldp, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ typeof(oldp) __ai_oldp = (oldp); \ instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \ instrument_atomic_write(__ai_oldp, sizeof(*__ai_oldp)); \ arch_try_cmpxchg_acquire(__ai_ptr, __ai_oldp, __VA_ARGS__); \ }) #define try_cmpxchg_release(ptr, oldp, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ typeof(oldp) __ai_oldp = (oldp); \ instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \ instrument_atomic_write(__ai_oldp, sizeof(*__ai_oldp)); \ arch_try_cmpxchg_release(__ai_ptr, __ai_oldp, __VA_ARGS__); \ }) #define try_cmpxchg_relaxed(ptr, oldp, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ typeof(oldp) __ai_oldp = (oldp); \ instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \ instrument_atomic_write(__ai_oldp, sizeof(*__ai_oldp)); \ arch_try_cmpxchg_relaxed(__ai_ptr, __ai_oldp, __VA_ARGS__); \ }) #define cmpxchg_local(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \ arch_cmpxchg_local(__ai_ptr, __VA_ARGS__); \ }) #define cmpxchg64_local(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \ arch_cmpxchg64_local(__ai_ptr, __VA_ARGS__); \ }) #define sync_cmpxchg(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ instrument_atomic_write(__ai_ptr, sizeof(*__ai_ptr)); \ arch_sync_cmpxchg(__ai_ptr, __VA_ARGS__); \ }) #define cmpxchg_double(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ instrument_atomic_write(__ai_ptr, 2 * sizeof(*__ai_ptr)); \ arch_cmpxchg_double(__ai_ptr, __VA_ARGS__); \ }) #define cmpxchg_double_local(ptr, ...) \ ({ \ typeof(ptr) __ai_ptr = (ptr); \ instrument_atomic_write(__ai_ptr, 2 * sizeof(*__ai_ptr)); \ arch_cmpxchg_double_local(__ai_ptr, __VA_ARGS__); \ }) #endif /* _LINUX_ATOMIC_INSTRUMENTED_H */ // 2a9553f0a9d5619f19151092df5cabbbf16ce835
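The wrappers and macros above come from the kernel's generated atomic-instrumented.h header: each operation first reports the memory access through instrument_atomic_read()/instrument_atomic_write()/instrument_atomic_read_write() so that sanitizers such as KCSAN and KASAN can check it, and then falls through to the corresponding arch_*() primitive. As a minimal sketch of how a caller sees this (the example helpers and their names below are illustrative, not taken from this file), ordinary code simply uses the public API and gets the instrumentation transparently:

#include <linux/atomic.h>
#include <linux/printk.h>
#include <linux/types.h>

/* Hypothetical refcount-style helper built on the instrumented
 * atomic_long_*() API shown above; every atomic call below is
 * instrumented before it reaches the arch_atomic_long_*() backend.
 */
static bool example_get_ref(atomic_long_t *refs)
{
	long old = atomic_long_read(refs);	/* instrument_atomic_read() */

	do {
		if (!old)	/* already dropped to zero, do not resurrect */
			return false;
		/* instruments both *refs and old before the arch_ try_cmpxchg */
	} while (!atomic_long_try_cmpxchg(refs, &old, old + 1));

	return true;
}

static void example_put_ref(atomic_long_t *refs)
{
	/* instrument_atomic_read_write() runs before the arch_ dec-and-test */
	if (atomic_long_dec_and_test(refs))
		pr_debug("example: last reference dropped\n");
}

The xchg()/cmpxchg()/try_cmpxchg() macro family at the end of the header follows the same pattern: it instruments __ai_ptr (and __ai_oldp for the try_ variants) and then delegates to the matching arch_ macro.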
// SPDX-License-Identifier: GPL-2.0 /* Copyright (C) B.A.T.M.A.N. 
contributors: * * Linus Lüssing */ #include "multicast.h" #include "main.h" #include <linux/atomic.h> #include <linux/bitops.h> #include <linux/bug.h> #include <linux/byteorder/generic.h> #include <linux/errno.h> #include <linux/etherdevice.h> #include <linux/gfp.h> #include <linux/icmpv6.h> #include <linux/if_bridge.h> #include <linux/if_ether.h> #include <linux/igmp.h> #include <linux/in.h> #include <linux/in6.h> #include <linux/inetdevice.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/jiffies.h> #include <linux/kernel.h> #include <linux/kref.h> #include <linux/list.h> #include <linux/lockdep.h> #include <linux/netdevice.h> #include <linux/netlink.h> #include <linux/printk.h> #include <linux/rculist.h> #include <linux/rcupdate.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/stddef.h> #include <linux/string.h> #include <linux/types.h> #include <linux/workqueue.h> #include <net/addrconf.h> #include <net/genetlink.h> #include <net/if_inet6.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/netlink.h> #include <net/sock.h> #include <uapi/linux/batadv_packet.h> #include <uapi/linux/batman_adv.h> #include "bridge_loop_avoidance.h" #include "hard-interface.h" #include "hash.h" #include "log.h" #include "netlink.h" #include "send.h" #include "soft-interface.h" #include "translation-table.h" #include "tvlv.h" static void batadv_mcast_mla_update(struct work_struct *work); /** * batadv_mcast_start_timer() - schedule the multicast periodic worker * @bat_priv: the bat priv with all the soft interface information */ static void batadv_mcast_start_timer(struct batadv_priv *bat_priv) { queue_delayed_work(batadv_event_workqueue, &bat_priv->mcast.work, msecs_to_jiffies(BATADV_MCAST_WORK_PERIOD)); } /** * batadv_mcast_get_bridge() - get the bridge on top of the softif if it exists * @soft_iface: netdev struct of the mesh interface * * If the given soft interface has a bridge on top then the refcount * of the according net device is increased. * * Return: NULL if no such bridge exists. Otherwise the net device of the * bridge. */ static struct net_device *batadv_mcast_get_bridge(struct net_device *soft_iface) { struct net_device *upper = soft_iface; rcu_read_lock(); do { upper = netdev_master_upper_dev_get_rcu(upper); } while (upper && !(upper->priv_flags & IFF_EBRIDGE)); dev_hold(upper); rcu_read_unlock(); return upper; } /** * batadv_mcast_mla_rtr_flags_softif_get_ipv4() - get mcast router flags from * node for IPv4 * @dev: the interface to check * * Checks the presence of an IPv4 multicast router on this node. * * Caller needs to hold rcu read lock. * * Return: BATADV_NO_FLAGS if present, BATADV_MCAST_WANT_NO_RTR4 otherwise. */ static u8 batadv_mcast_mla_rtr_flags_softif_get_ipv4(struct net_device *dev) { struct in_device *in_dev = __in_dev_get_rcu(dev); if (in_dev && IN_DEV_MFORWARD(in_dev)) return BATADV_NO_FLAGS; else return BATADV_MCAST_WANT_NO_RTR4; } /** * batadv_mcast_mla_rtr_flags_softif_get_ipv6() - get mcast router flags from * node for IPv6 * @dev: the interface to check * * Checks the presence of an IPv6 multicast router on this node. * * Caller needs to hold rcu read lock. * * Return: BATADV_NO_FLAGS if present, BATADV_MCAST_WANT_NO_RTR6 otherwise. 
*/ #if IS_ENABLED(CONFIG_IPV6_MROUTE) static u8 batadv_mcast_mla_rtr_flags_softif_get_ipv6(struct net_device *dev) { struct inet6_dev *in6_dev = __in6_dev_get(dev); if (in6_dev && atomic_read(&in6_dev->cnf.mc_forwarding)) return BATADV_NO_FLAGS; else return BATADV_MCAST_WANT_NO_RTR6; } #else static inline u8 batadv_mcast_mla_rtr_flags_softif_get_ipv6(struct net_device *dev) { return BATADV_MCAST_WANT_NO_RTR6; } #endif /** * batadv_mcast_mla_rtr_flags_softif_get() - get mcast router flags from node * @bat_priv: the bat priv with all the soft interface information * @bridge: bridge interface on top of the soft_iface if present, * otherwise pass NULL * * Checks the presence of IPv4 and IPv6 multicast routers on this * node. * * Return: * BATADV_NO_FLAGS: Both an IPv4 and IPv6 multicast router is present * BATADV_MCAST_WANT_NO_RTR4: No IPv4 multicast router is present * BATADV_MCAST_WANT_NO_RTR6: No IPv6 multicast router is present * The former two OR'd: no multicast router is present */ static u8 batadv_mcast_mla_rtr_flags_softif_get(struct batadv_priv *bat_priv, struct net_device *bridge) { struct net_device *dev = bridge ? bridge : bat_priv->soft_iface; u8 flags = BATADV_NO_FLAGS; rcu_read_lock(); flags |= batadv_mcast_mla_rtr_flags_softif_get_ipv4(dev); flags |= batadv_mcast_mla_rtr_flags_softif_get_ipv6(dev); rcu_read_unlock(); return flags; } /** * batadv_mcast_mla_rtr_flags_bridge_get() - get mcast router flags from bridge * @bat_priv: the bat priv with all the soft interface information * @bridge: bridge interface on top of the soft_iface if present, * otherwise pass NULL * * Checks the presence of IPv4 and IPv6 multicast routers behind a bridge. * * Return: * BATADV_NO_FLAGS: Both an IPv4 and IPv6 multicast router is present * BATADV_MCAST_WANT_NO_RTR4: No IPv4 multicast router is present * BATADV_MCAST_WANT_NO_RTR6: No IPv6 multicast router is present * The former two OR'd: no multicast router is present */ static u8 batadv_mcast_mla_rtr_flags_bridge_get(struct batadv_priv *bat_priv, struct net_device *bridge) { struct net_device *dev = bat_priv->soft_iface; u8 flags = BATADV_NO_FLAGS; if (!bridge) return BATADV_MCAST_WANT_NO_RTR4 | BATADV_MCAST_WANT_NO_RTR6; if (!br_multicast_has_router_adjacent(dev, ETH_P_IP)) flags |= BATADV_MCAST_WANT_NO_RTR4; if (!br_multicast_has_router_adjacent(dev, ETH_P_IPV6)) flags |= BATADV_MCAST_WANT_NO_RTR6; return flags; } /** * batadv_mcast_mla_rtr_flags_get() - get multicast router flags * @bat_priv: the bat priv with all the soft interface information * @bridge: bridge interface on top of the soft_iface if present, * otherwise pass NULL * * Checks the presence of IPv4 and IPv6 multicast routers on this * node or behind its bridge. * * Return: * BATADV_NO_FLAGS: Both an IPv4 and IPv6 multicast router is present * BATADV_MCAST_WANT_NO_RTR4: No IPv4 multicast router is present * BATADV_MCAST_WANT_NO_RTR6: No IPv6 multicast router is present * The former two OR'd: no multicast router is present */ static u8 batadv_mcast_mla_rtr_flags_get(struct batadv_priv *bat_priv, struct net_device *bridge) { u8 flags = BATADV_MCAST_WANT_NO_RTR4 | BATADV_MCAST_WANT_NO_RTR6; flags &= batadv_mcast_mla_rtr_flags_softif_get(bat_priv, bridge); flags &= batadv_mcast_mla_rtr_flags_bridge_get(bat_priv, bridge); return flags; } /** * batadv_mcast_mla_flags_get() - get the new multicast flags * @bat_priv: the bat priv with all the soft interface information * * Return: A set of flags for the current/next TVLV, querier and * bridge state. 
*/ static struct batadv_mcast_mla_flags batadv_mcast_mla_flags_get(struct batadv_priv *bat_priv) { struct net_device *dev = bat_priv->soft_iface; struct batadv_mcast_querier_state *qr4, *qr6; struct batadv_mcast_mla_flags mla_flags; struct net_device *bridge; bridge = batadv_mcast_get_bridge(dev); memset(&mla_flags, 0, sizeof(mla_flags)); mla_flags.enabled = 1; mla_flags.tvlv_flags |= batadv_mcast_mla_rtr_flags_get(bat_priv, bridge); if (!bridge) return mla_flags; dev_put(bridge); mla_flags.bridged = 1; qr4 = &mla_flags.querier_ipv4; qr6 = &mla_flags.querier_ipv6; if (!IS_ENABLED(CONFIG_BRIDGE_IGMP_SNOOPING)) pr_warn_once("No bridge IGMP snooping compiled - multicast optimizations disabled\n"); qr4->exists = br_multicast_has_querier_anywhere(dev, ETH_P_IP); qr4->shadowing = br_multicast_has_querier_adjacent(dev, ETH_P_IP); qr6->exists = br_multicast_has_querier_anywhere(dev, ETH_P_IPV6); qr6->shadowing = br_multicast_has_querier_adjacent(dev, ETH_P_IPV6); mla_flags.tvlv_flags |= BATADV_MCAST_WANT_ALL_UNSNOOPABLES; /* 1) If no querier exists at all, then multicast listeners on * our local TT clients behind the bridge will keep silent. * 2) If the selected querier is on one of our local TT clients, * behind the bridge, then this querier might shadow multicast * listeners on our local TT clients, behind this bridge. * * In both cases, we will signalize other batman nodes that * we need all multicast traffic of the according protocol. */ if (!qr4->exists || qr4->shadowing) { mla_flags.tvlv_flags |= BATADV_MCAST_WANT_ALL_IPV4; mla_flags.tvlv_flags &= ~BATADV_MCAST_WANT_NO_RTR4; } if (!qr6->exists || qr6->shadowing) { mla_flags.tvlv_flags |= BATADV_MCAST_WANT_ALL_IPV6; mla_flags.tvlv_flags &= ~BATADV_MCAST_WANT_NO_RTR6; } return mla_flags; } /** * batadv_mcast_mla_is_duplicate() - check whether an address is in a list * @mcast_addr: the multicast address to check * @mcast_list: the list with multicast addresses to search in * * Return: true if the given address is already in the given list. * Otherwise returns false. */ static bool batadv_mcast_mla_is_duplicate(u8 *mcast_addr, struct hlist_head *mcast_list) { struct batadv_hw_addr *mcast_entry; hlist_for_each_entry(mcast_entry, mcast_list, list) if (batadv_compare_eth(mcast_entry->addr, mcast_addr)) return true; return false; } /** * batadv_mcast_mla_softif_get_ipv4() - get softif IPv4 multicast listeners * @dev: the device to collect multicast addresses from * @mcast_list: a list to put found addresses into * @flags: flags indicating the new multicast state * * Collects multicast addresses of IPv4 multicast listeners residing * on this kernel on the given soft interface, dev, in * the given mcast_list. In general, multicast listeners provided by * your multicast receiving applications run directly on this node. * * Return: -ENOMEM on memory allocation error or the number of * items added to the mcast_list otherwise. 
*/ static int batadv_mcast_mla_softif_get_ipv4(struct net_device *dev, struct hlist_head *mcast_list, struct batadv_mcast_mla_flags *flags) { struct batadv_hw_addr *new; struct in_device *in_dev; u8 mcast_addr[ETH_ALEN]; struct ip_mc_list *pmc; int ret = 0; if (flags->tvlv_flags & BATADV_MCAST_WANT_ALL_IPV4) return 0; rcu_read_lock(); in_dev = __in_dev_get_rcu(dev); if (!in_dev) { rcu_read_unlock(); return 0; } for (pmc = rcu_dereference(in_dev->mc_list); pmc; pmc = rcu_dereference(pmc->next_rcu)) { if (flags->tvlv_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES && ipv4_is_local_multicast(pmc->multiaddr)) continue; if (!(flags->tvlv_flags & BATADV_MCAST_WANT_NO_RTR4) && !ipv4_is_local_multicast(pmc->multiaddr)) continue; ip_eth_mc_map(pmc->multiaddr, mcast_addr); if (batadv_mcast_mla_is_duplicate(mcast_addr, mcast_list)) continue; new = kmalloc(sizeof(*new), GFP_ATOMIC); if (!new) { ret = -ENOMEM; break; } ether_addr_copy(new->addr, mcast_addr); hlist_add_head(&new->list, mcast_list); ret++; } rcu_read_unlock(); return ret; } /** * batadv_mcast_mla_softif_get_ipv6() - get softif IPv6 multicast listeners * @dev: the device to collect multicast addresses from * @mcast_list: a list to put found addresses into * @flags: flags indicating the new multicast state * * Collects multicast addresses of IPv6 multicast listeners residing * on this kernel on the given soft interface, dev, in * the given mcast_list. In general, multicast listeners provided by * your multicast receiving applications run directly on this node. * * Return: -ENOMEM on memory allocation error or the number of * items added to the mcast_list otherwise. */ #if IS_ENABLED(CONFIG_IPV6) static int batadv_mcast_mla_softif_get_ipv6(struct net_device *dev, struct hlist_head *mcast_list, struct batadv_mcast_mla_flags *flags) { struct batadv_hw_addr *new; struct inet6_dev *in6_dev; u8 mcast_addr[ETH_ALEN]; struct ifmcaddr6 *pmc6; int ret = 0; if (flags->tvlv_flags & BATADV_MCAST_WANT_ALL_IPV6) return 0; rcu_read_lock(); in6_dev = __in6_dev_get(dev); if (!in6_dev) { rcu_read_unlock(); return 0; } for (pmc6 = rcu_dereference(in6_dev->mc_list); pmc6; pmc6 = rcu_dereference(pmc6->next)) { if (IPV6_ADDR_MC_SCOPE(&pmc6->mca_addr) < IPV6_ADDR_SCOPE_LINKLOCAL) continue; if (flags->tvlv_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES && ipv6_addr_is_ll_all_nodes(&pmc6->mca_addr)) continue; if (!(flags->tvlv_flags & BATADV_MCAST_WANT_NO_RTR6) && IPV6_ADDR_MC_SCOPE(&pmc6->mca_addr) > IPV6_ADDR_SCOPE_LINKLOCAL) continue; ipv6_eth_mc_map(&pmc6->mca_addr, mcast_addr); if (batadv_mcast_mla_is_duplicate(mcast_addr, mcast_list)) continue; new = kmalloc(sizeof(*new), GFP_ATOMIC); if (!new) { ret = -ENOMEM; break; } ether_addr_copy(new->addr, mcast_addr); hlist_add_head(&new->list, mcast_list); ret++; } rcu_read_unlock(); return ret; } #else static inline int batadv_mcast_mla_softif_get_ipv6(struct net_device *dev, struct hlist_head *mcast_list, struct batadv_mcast_mla_flags *flags) { return 0; } #endif /** * batadv_mcast_mla_softif_get() - get softif multicast listeners * @dev: the device to collect multicast addresses from * @mcast_list: a list to put found addresses into * @flags: flags indicating the new multicast state * * Collects multicast addresses of multicast listeners residing * on this kernel on the given soft interface, dev, in * the given mcast_list. In general, multicast listeners provided by * your multicast receiving applications run directly on this node. * * If there is a bridge interface on top of dev, collect from that one * instead. 
Just like with IP addresses and routes, multicast listeners * will(/should) register to the bridge interface instead of an * enslaved bat0. * * Return: -ENOMEM on memory allocation error or the number of * items added to the mcast_list otherwise. */ static int batadv_mcast_mla_softif_get(struct net_device *dev, struct hlist_head *mcast_list, struct batadv_mcast_mla_flags *flags) { struct net_device *bridge = batadv_mcast_get_bridge(dev); int ret4, ret6 = 0; if (bridge) dev = bridge; ret4 = batadv_mcast_mla_softif_get_ipv4(dev, mcast_list, flags); if (ret4 < 0) goto out; ret6 = batadv_mcast_mla_softif_get_ipv6(dev, mcast_list, flags); if (ret6 < 0) { ret4 = 0; goto out; } out: dev_put(bridge); return ret4 + ret6; } /** * batadv_mcast_mla_br_addr_cpy() - copy a bridge multicast address * @dst: destination to write to - a multicast MAC address * @src: source to read from - a multicast IP address * * Converts a given multicast IPv4/IPv6 address from a bridge * to its matching multicast MAC address and copies it into the given * destination buffer. * * Caller needs to make sure the destination buffer can hold * at least ETH_ALEN bytes. */ static void batadv_mcast_mla_br_addr_cpy(char *dst, const struct br_ip *src) { if (src->proto == htons(ETH_P_IP)) ip_eth_mc_map(src->dst.ip4, dst); #if IS_ENABLED(CONFIG_IPV6) else if (src->proto == htons(ETH_P_IPV6)) ipv6_eth_mc_map(&src->dst.ip6, dst); #endif else eth_zero_addr(dst); } /** * batadv_mcast_mla_bridge_get() - get bridged-in multicast listeners * @dev: a bridge slave whose bridge to collect multicast addresses from * @mcast_list: a list to put found addresses into * @flags: flags indicating the new multicast state * * Collects multicast addresses of multicast listeners residing * on foreign, non-mesh devices which we gave access to our mesh via * a bridge on top of the given soft interface, dev, in the given * mcast_list. * * Return: -ENOMEM on memory allocation error or the number of * items added to the mcast_list otherwise. 
*/ static int batadv_mcast_mla_bridge_get(struct net_device *dev, struct hlist_head *mcast_list, struct batadv_mcast_mla_flags *flags) { struct list_head bridge_mcast_list = LIST_HEAD_INIT(bridge_mcast_list); struct br_ip_list *br_ip_entry, *tmp; u8 tvlv_flags = flags->tvlv_flags; struct batadv_hw_addr *new; u8 mcast_addr[ETH_ALEN]; int ret; /* we don't need to detect these devices/listeners, the IGMP/MLD * snooping code of the Linux bridge already does that for us */ ret = br_multicast_list_adjacent(dev, &bridge_mcast_list); if (ret < 0) goto out; list_for_each_entry(br_ip_entry, &bridge_mcast_list, list) { if (br_ip_entry->addr.proto == htons(ETH_P_IP)) { if (tvlv_flags & BATADV_MCAST_WANT_ALL_IPV4) continue; if (tvlv_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES && ipv4_is_local_multicast(br_ip_entry->addr.dst.ip4)) continue; if (!(tvlv_flags & BATADV_MCAST_WANT_NO_RTR4) && !ipv4_is_local_multicast(br_ip_entry->addr.dst.ip4)) continue; } #if IS_ENABLED(CONFIG_IPV6) if (br_ip_entry->addr.proto == htons(ETH_P_IPV6)) { if (tvlv_flags & BATADV_MCAST_WANT_ALL_IPV6) continue; if (tvlv_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES && ipv6_addr_is_ll_all_nodes(&br_ip_entry->addr.dst.ip6)) continue; if (!(tvlv_flags & BATADV_MCAST_WANT_NO_RTR6) && IPV6_ADDR_MC_SCOPE(&br_ip_entry->addr.dst.ip6) > IPV6_ADDR_SCOPE_LINKLOCAL) continue; } #endif batadv_mcast_mla_br_addr_cpy(mcast_addr, &br_ip_entry->addr); if (batadv_mcast_mla_is_duplicate(mcast_addr, mcast_list)) continue; new = kmalloc(sizeof(*new), GFP_ATOMIC); if (!new) { ret = -ENOMEM; break; } ether_addr_copy(new->addr, mcast_addr); hlist_add_head(&new->list, mcast_list); } out: list_for_each_entry_safe(br_ip_entry, tmp, &bridge_mcast_list, list) { list_del(&br_ip_entry->list); kfree(br_ip_entry); } return ret; } /** * batadv_mcast_mla_list_free() - free a list of multicast addresses * @mcast_list: the list to free * * Removes and frees all items in the given mcast_list. */ static void batadv_mcast_mla_list_free(struct hlist_head *mcast_list) { struct batadv_hw_addr *mcast_entry; struct hlist_node *tmp; hlist_for_each_entry_safe(mcast_entry, tmp, mcast_list, list) { hlist_del(&mcast_entry->list); kfree(mcast_entry); } } /** * batadv_mcast_mla_tt_retract() - clean up multicast listener announcements * @bat_priv: the bat priv with all the soft interface information * @mcast_list: a list of addresses which should _not_ be removed * * Retracts the announcement of any multicast listener from the * translation table except the ones listed in the given mcast_list. * * If mcast_list is NULL then all are retracted. */ static void batadv_mcast_mla_tt_retract(struct batadv_priv *bat_priv, struct hlist_head *mcast_list) { struct batadv_hw_addr *mcast_entry; struct hlist_node *tmp; hlist_for_each_entry_safe(mcast_entry, tmp, &bat_priv->mcast.mla_list, list) { if (mcast_list && batadv_mcast_mla_is_duplicate(mcast_entry->addr, mcast_list)) continue; batadv_tt_local_remove(bat_priv, mcast_entry->addr, BATADV_NO_FLAGS, "mcast TT outdated", false); hlist_del(&mcast_entry->list); kfree(mcast_entry); } } /** * batadv_mcast_mla_tt_add() - add multicast listener announcements * @bat_priv: the bat priv with all the soft interface information * @mcast_list: a list of addresses which are going to get added * * Adds multicast listener announcements from the given mcast_list to the * translation table if they have not been added yet. 
*/ static void batadv_mcast_mla_tt_add(struct batadv_priv *bat_priv, struct hlist_head *mcast_list) { struct batadv_hw_addr *mcast_entry; struct hlist_node *tmp; if (!mcast_list) return; hlist_for_each_entry_safe(mcast_entry, tmp, mcast_list, list) { if (batadv_mcast_mla_is_duplicate(mcast_entry->addr, &bat_priv->mcast.mla_list)) continue; if (!batadv_tt_local_add(bat_priv->soft_iface, mcast_entry->addr, BATADV_NO_FLAGS, BATADV_NULL_IFINDEX, BATADV_NO_MARK)) continue; hlist_del(&mcast_entry->list); hlist_add_head(&mcast_entry->list, &bat_priv->mcast.mla_list); } } /** * batadv_mcast_querier_log() - debug output regarding the querier status on * link * @bat_priv: the bat priv with all the soft interface information * @str_proto: a string for the querier protocol (e.g. "IGMP" or "MLD") * @old_state: the previous querier state on our link * @new_state: the new querier state on our link * * Outputs debug messages to the logging facility with log level 'mcast' * regarding changes to the querier status on the link which are relevant * to our multicast optimizations. * * Usually this is about whether a querier appeared or vanished in * our mesh or whether the querier is in the suboptimal position of being * behind our local bridge segment: Snooping switches will directly * forward listener reports to the querier, therefore batman-adv and * the bridge will potentially not see these listeners - the querier is * potentially shadowing listeners from us then. * * This is only interesting for nodes with a bridge on top of their * soft interface. */ static void batadv_mcast_querier_log(struct batadv_priv *bat_priv, char *str_proto, struct batadv_mcast_querier_state *old_state, struct batadv_mcast_querier_state *new_state) { if (!old_state->exists && new_state->exists) batadv_info(bat_priv->soft_iface, "%s Querier appeared\n", str_proto); else if (old_state->exists && !new_state->exists) batadv_info(bat_priv->soft_iface, "%s Querier disappeared - multicast optimizations disabled\n", str_proto); else if (!bat_priv->mcast.mla_flags.bridged && !new_state->exists) batadv_info(bat_priv->soft_iface, "No %s Querier present - multicast optimizations disabled\n", str_proto); if (new_state->exists) { if ((!old_state->shadowing && new_state->shadowing) || (!old_state->exists && new_state->shadowing)) batadv_dbg(BATADV_DBG_MCAST, bat_priv, "%s Querier is behind our bridged segment: Might shadow listeners\n", str_proto); else if (old_state->shadowing && !new_state->shadowing) batadv_dbg(BATADV_DBG_MCAST, bat_priv, "%s Querier is not behind our bridged segment\n", str_proto); } } /** * batadv_mcast_bridge_log() - debug output for topology changes in bridged * setups * @bat_priv: the bat priv with all the soft interface information * @new_flags: flags indicating the new multicast state * * If no bridges are ever used on this node, then this function does nothing. * * Otherwise this function outputs debug information to the 'mcast' log level * which might be relevant to our multicast optimizations. * * More precisely, it outputs information when a bridge interface is added or * removed from a soft interface. And when a bridge is present, it further * outputs information about the querier state which is relevant for the * multicast flags this node is going to set. 
*/ static void batadv_mcast_bridge_log(struct batadv_priv *bat_priv, struct batadv_mcast_mla_flags *new_flags) { struct batadv_mcast_mla_flags *old_flags = &bat_priv->mcast.mla_flags; if (!old_flags->bridged && new_flags->bridged) batadv_dbg(BATADV_DBG_MCAST, bat_priv, "Bridge added: Setting Unsnoopables(U)-flag\n"); else if (old_flags->bridged && !new_flags->bridged) batadv_dbg(BATADV_DBG_MCAST, bat_priv, "Bridge removed: Unsetting Unsnoopables(U)-flag\n"); if (new_flags->bridged) { batadv_mcast_querier_log(bat_priv, "IGMP", &old_flags->querier_ipv4, &new_flags->querier_ipv4); batadv_mcast_querier_log(bat_priv, "MLD", &old_flags->querier_ipv6, &new_flags->querier_ipv6); } } /** * batadv_mcast_flags_log() - output debug information about mcast flag changes * @bat_priv: the bat priv with all the soft interface information * @flags: TVLV flags indicating the new multicast state * * Whenever the multicast TVLV flags this node announces change, this function * should be used to notify userspace about the change. */ static void batadv_mcast_flags_log(struct batadv_priv *bat_priv, u8 flags) { bool old_enabled = bat_priv->mcast.mla_flags.enabled; u8 old_flags = bat_priv->mcast.mla_flags.tvlv_flags; char str_old_flags[] = "[.... . ]"; sprintf(str_old_flags, "[%c%c%c%s%s]", (old_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES) ? 'U' : '.', (old_flags & BATADV_MCAST_WANT_ALL_IPV4) ? '4' : '.', (old_flags & BATADV_MCAST_WANT_ALL_IPV6) ? '6' : '.', !(old_flags & BATADV_MCAST_WANT_NO_RTR4) ? "R4" : ". ", !(old_flags & BATADV_MCAST_WANT_NO_RTR6) ? "R6" : ". "); batadv_dbg(BATADV_DBG_MCAST, bat_priv, "Changing multicast flags from '%s' to '[%c%c%c%s%s]'\n", old_enabled ? str_old_flags : "<undefined>", (flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES) ? 'U' : '.', (flags & BATADV_MCAST_WANT_ALL_IPV4) ? '4' : '.', (flags & BATADV_MCAST_WANT_ALL_IPV6) ? '6' : '.', !(flags & BATADV_MCAST_WANT_NO_RTR4) ? "R4" : ". ", !(flags & BATADV_MCAST_WANT_NO_RTR6) ? "R6" : ". "); } /** * batadv_mcast_mla_flags_update() - update multicast flags * @bat_priv: the bat priv with all the soft interface information * @flags: flags indicating the new multicast state * * Updates the own multicast tvlv with our current multicast related settings, * capabilities and inabilities. */ static void batadv_mcast_mla_flags_update(struct batadv_priv *bat_priv, struct batadv_mcast_mla_flags *flags) { struct batadv_tvlv_mcast_data mcast_data; if (!memcmp(flags, &bat_priv->mcast.mla_flags, sizeof(*flags))) return; batadv_mcast_bridge_log(bat_priv, flags); batadv_mcast_flags_log(bat_priv, flags->tvlv_flags); mcast_data.flags = flags->tvlv_flags; memset(mcast_data.reserved, 0, sizeof(mcast_data.reserved)); batadv_tvlv_container_register(bat_priv, BATADV_TVLV_MCAST, 2, &mcast_data, sizeof(mcast_data)); bat_priv->mcast.mla_flags = *flags; } /** * __batadv_mcast_mla_update() - update the own MLAs * @bat_priv: the bat priv with all the soft interface information * * Updates the own multicast listener announcements in the translation * table as well as the own, announced multicast tvlv container. * * Note that non-conflicting reads and writes to bat_priv->mcast.mla_list * in batadv_mcast_mla_tt_retract() and batadv_mcast_mla_tt_add() are * ensured by the non-parallel execution of the worker this function * belongs to. 
*/ static void __batadv_mcast_mla_update(struct batadv_priv *bat_priv) { struct net_device *soft_iface = bat_priv->soft_iface; struct hlist_head mcast_list = HLIST_HEAD_INIT; struct batadv_mcast_mla_flags flags; int ret; flags = batadv_mcast_mla_flags_get(bat_priv); ret = batadv_mcast_mla_softif_get(soft_iface, &mcast_list, &flags); if (ret < 0) goto out; ret = batadv_mcast_mla_bridge_get(soft_iface, &mcast_list, &flags); if (ret < 0) goto out; spin_lock(&bat_priv->mcast.mla_lock); batadv_mcast_mla_tt_retract(bat_priv, &mcast_list); batadv_mcast_mla_tt_add(bat_priv, &mcast_list); batadv_mcast_mla_flags_update(bat_priv, &flags); spin_unlock(&bat_priv->mcast.mla_lock); out: batadv_mcast_mla_list_free(&mcast_list); } /** * batadv_mcast_mla_update() - update the own MLAs * @work: kernel work struct * * Updates the own multicast listener announcements in the translation * table as well as the own, announced multicast tvlv container. * * In the end, reschedules the work timer. */ static void batadv_mcast_mla_update(struct work_struct *work) { struct delayed_work *delayed_work; struct batadv_priv_mcast *priv_mcast; struct batadv_priv *bat_priv; delayed_work = to_delayed_work(work); priv_mcast = container_of(delayed_work, struct batadv_priv_mcast, work); bat_priv = container_of(priv_mcast, struct batadv_priv, mcast); __batadv_mcast_mla_update(bat_priv); batadv_mcast_start_timer(bat_priv); } /** * batadv_mcast_is_report_ipv4() - check for IGMP reports * @skb: the ethernet frame destined for the mesh * * This call might reallocate skb data. * * Checks whether the given frame is a valid IGMP report. * * Return: If so then true, otherwise false. */ static bool batadv_mcast_is_report_ipv4(struct sk_buff *skb) { if (ip_mc_check_igmp(skb) < 0) return false; switch (igmp_hdr(skb)->type) { case IGMP_HOST_MEMBERSHIP_REPORT: case IGMPV2_HOST_MEMBERSHIP_REPORT: case IGMPV3_HOST_MEMBERSHIP_REPORT: return true; } return false; } /** * batadv_mcast_forw_mode_check_ipv4() - check for optimized forwarding * potential * @bat_priv: the bat priv with all the soft interface information * @skb: the IPv4 packet to check * @is_unsnoopable: stores whether the destination is snoopable * @is_routable: stores whether the destination is routable * * Checks whether the given IPv4 packet has the potential to be forwarded with a * mode more optimal than classic flooding. * * Return: If so then 0. Otherwise -EINVAL or -ENOMEM in case of memory * allocation failure. */ static int batadv_mcast_forw_mode_check_ipv4(struct batadv_priv *bat_priv, struct sk_buff *skb, bool *is_unsnoopable, int *is_routable) { struct iphdr *iphdr; /* We might fail due to out-of-memory -> drop it */ if (!pskb_may_pull(skb, sizeof(struct ethhdr) + sizeof(*iphdr))) return -ENOMEM; if (batadv_mcast_is_report_ipv4(skb)) return -EINVAL; iphdr = ip_hdr(skb); /* link-local multicast listeners behind a bridge are * not snoopable (see RFC4541, section 2.1.2.2) */ if (ipv4_is_local_multicast(iphdr->daddr)) *is_unsnoopable = true; else *is_routable = ETH_P_IP; return 0; } /** * batadv_mcast_is_report_ipv6() - check for MLD reports * @skb: the ethernet frame destined for the mesh * * This call might reallocate skb data. * * Checks whether the given frame is a valid MLD report. * * Return: If so then true, otherwise false. 
*/ static bool batadv_mcast_is_report_ipv6(struct sk_buff *skb) { if (ipv6_mc_check_mld(skb) < 0) return false; switch (icmp6_hdr(skb)->icmp6_type) { case ICMPV6_MGM_REPORT: case ICMPV6_MLD2_REPORT: return true; } return false; } /** * batadv_mcast_forw_mode_check_ipv6() - check for optimized forwarding * potential * @bat_priv: the bat priv with all the soft interface information * @skb: the IPv6 packet to check * @is_unsnoopable: stores whether the destination is snoopable * @is_routable: stores whether the destination is routable * * Checks whether the given IPv6 packet has the potential to be forwarded with a * mode more optimal than classic flooding. * * Return: If so then 0. Otherwise -EINVAL is or -ENOMEM if we are out of memory */ static int batadv_mcast_forw_mode_check_ipv6(struct batadv_priv *bat_priv, struct sk_buff *skb, bool *is_unsnoopable, int *is_routable) { struct ipv6hdr *ip6hdr; /* We might fail due to out-of-memory -> drop it */ if (!pskb_may_pull(skb, sizeof(struct ethhdr) + sizeof(*ip6hdr))) return -ENOMEM; if (batadv_mcast_is_report_ipv6(skb)) return -EINVAL; ip6hdr = ipv6_hdr(skb); if (IPV6_ADDR_MC_SCOPE(&ip6hdr->daddr) < IPV6_ADDR_SCOPE_LINKLOCAL) return -EINVAL; /* link-local-all-nodes multicast listeners behind a bridge are * not snoopable (see RFC4541, section 3, paragraph 3) */ if (ipv6_addr_is_ll_all_nodes(&ip6hdr->daddr)) *is_unsnoopable = true; else if (IPV6_ADDR_MC_SCOPE(&ip6hdr->daddr) > IPV6_ADDR_SCOPE_LINKLOCAL) *is_routable = ETH_P_IPV6; return 0; } /** * batadv_mcast_forw_mode_check() - check for optimized forwarding potential * @bat_priv: the bat priv with all the soft interface information * @skb: the multicast frame to check * @is_unsnoopable: stores whether the destination is snoopable * @is_routable: stores whether the destination is routable * * Checks whether the given multicast ethernet frame has the potential to be * forwarded with a mode more optimal than classic flooding. * * Return: If so then 0. Otherwise -EINVAL is or -ENOMEM if we are out of memory */ static int batadv_mcast_forw_mode_check(struct batadv_priv *bat_priv, struct sk_buff *skb, bool *is_unsnoopable, int *is_routable) { struct ethhdr *ethhdr = eth_hdr(skb); if (!atomic_read(&bat_priv->multicast_mode)) return -EINVAL; switch (ntohs(ethhdr->h_proto)) { case ETH_P_IP: return batadv_mcast_forw_mode_check_ipv4(bat_priv, skb, is_unsnoopable, is_routable); case ETH_P_IPV6: if (!IS_ENABLED(CONFIG_IPV6)) return -EINVAL; return batadv_mcast_forw_mode_check_ipv6(bat_priv, skb, is_unsnoopable, is_routable); default: return -EINVAL; } } /** * batadv_mcast_forw_want_all_ip_count() - count nodes with unspecific mcast * interest * @bat_priv: the bat priv with all the soft interface information * @ethhdr: ethernet header of a packet * * Return: the number of nodes which want all IPv4 multicast traffic if the * given ethhdr is from an IPv4 packet or the number of nodes which want all * IPv6 traffic if it matches an IPv6 packet. */ static int batadv_mcast_forw_want_all_ip_count(struct batadv_priv *bat_priv, struct ethhdr *ethhdr) { switch (ntohs(ethhdr->h_proto)) { case ETH_P_IP: return atomic_read(&bat_priv->mcast.num_want_all_ipv4); case ETH_P_IPV6: return atomic_read(&bat_priv->mcast.num_want_all_ipv6); default: /* we shouldn't be here... 
*/ return 0; } } /** * batadv_mcast_forw_rtr_count() - count nodes with a multicast router * @bat_priv: the bat priv with all the soft interface information * @protocol: the ethernet protocol type to count multicast routers for * * Return: the number of nodes which want all routable IPv4 multicast traffic * if the protocol is ETH_P_IP or the number of nodes which want all routable * IPv6 traffic if the protocol is ETH_P_IPV6. Otherwise returns 0. */ static int batadv_mcast_forw_rtr_count(struct batadv_priv *bat_priv, int protocol) { switch (protocol) { case ETH_P_IP: return atomic_read(&bat_priv->mcast.num_want_all_rtr4); case ETH_P_IPV6: return atomic_read(&bat_priv->mcast.num_want_all_rtr6); default: return 0; } } /** * batadv_mcast_forw_tt_node_get() - get a multicast tt node * @bat_priv: the bat priv with all the soft interface information * @ethhdr: the ether header containing the multicast destination * * Return: an orig_node matching the multicast address provided by ethhdr * via a translation table lookup. This increases the returned nodes refcount. */ static struct batadv_orig_node * batadv_mcast_forw_tt_node_get(struct batadv_priv *bat_priv, struct ethhdr *ethhdr) { return batadv_transtable_search(bat_priv, NULL, ethhdr->h_dest, BATADV_NO_FLAGS); } /** * batadv_mcast_forw_ipv4_node_get() - get a node with an ipv4 flag * @bat_priv: the bat priv with all the soft interface information * * Return: an orig_node which has the BATADV_MCAST_WANT_ALL_IPV4 flag set and * increases its refcount. */ static struct batadv_orig_node * batadv_mcast_forw_ipv4_node_get(struct batadv_priv *bat_priv) { struct batadv_orig_node *tmp_orig_node, *orig_node = NULL; rcu_read_lock(); hlist_for_each_entry_rcu(tmp_orig_node, &bat_priv->mcast.want_all_ipv4_list, mcast_want_all_ipv4_node) { if (!kref_get_unless_zero(&tmp_orig_node->refcount)) continue; orig_node = tmp_orig_node; break; } rcu_read_unlock(); return orig_node; } /** * batadv_mcast_forw_ipv6_node_get() - get a node with an ipv6 flag * @bat_priv: the bat priv with all the soft interface information * * Return: an orig_node which has the BATADV_MCAST_WANT_ALL_IPV6 flag set * and increases its refcount. */ static struct batadv_orig_node * batadv_mcast_forw_ipv6_node_get(struct batadv_priv *bat_priv) { struct batadv_orig_node *tmp_orig_node, *orig_node = NULL; rcu_read_lock(); hlist_for_each_entry_rcu(tmp_orig_node, &bat_priv->mcast.want_all_ipv6_list, mcast_want_all_ipv6_node) { if (!kref_get_unless_zero(&tmp_orig_node->refcount)) continue; orig_node = tmp_orig_node; break; } rcu_read_unlock(); return orig_node; } /** * batadv_mcast_forw_ip_node_get() - get a node with an ipv4/ipv6 flag * @bat_priv: the bat priv with all the soft interface information * @ethhdr: an ethernet header to determine the protocol family from * * Return: an orig_node which has the BATADV_MCAST_WANT_ALL_IPV4 or * BATADV_MCAST_WANT_ALL_IPV6 flag, depending on the provided ethhdr, sets and * increases its refcount. */ static struct batadv_orig_node * batadv_mcast_forw_ip_node_get(struct batadv_priv *bat_priv, struct ethhdr *ethhdr) { switch (ntohs(ethhdr->h_proto)) { case ETH_P_IP: return batadv_mcast_forw_ipv4_node_get(bat_priv); case ETH_P_IPV6: return batadv_mcast_forw_ipv6_node_get(bat_priv); default: /* we shouldn't be here... 
*/ return NULL; } } /** * batadv_mcast_forw_unsnoop_node_get() - get a node with an unsnoopable flag * @bat_priv: the bat priv with all the soft interface information * * Return: an orig_node which has the BATADV_MCAST_WANT_ALL_UNSNOOPABLES flag * set and increases its refcount. */ static struct batadv_orig_node * batadv_mcast_forw_unsnoop_node_get(struct batadv_priv *bat_priv) { struct batadv_orig_node *tmp_orig_node, *orig_node = NULL; rcu_read_lock(); hlist_for_each_entry_rcu(tmp_orig_node, &bat_priv->mcast.want_all_unsnoopables_list, mcast_want_all_unsnoopables_node) { if (!kref_get_unless_zero(&tmp_orig_node->refcount)) continue; orig_node = tmp_orig_node; break; } rcu_read_unlock(); return orig_node; } /** * batadv_mcast_forw_rtr4_node_get() - get a node with an ipv4 mcast router flag * @bat_priv: the bat priv with all the soft interface information * * Return: an orig_node which has the BATADV_MCAST_WANT_NO_RTR4 flag unset and * increases its refcount. */ static struct batadv_orig_node * batadv_mcast_forw_rtr4_node_get(struct batadv_priv *bat_priv) { struct batadv_orig_node *tmp_orig_node, *orig_node = NULL; rcu_read_lock(); hlist_for_each_entry_rcu(tmp_orig_node, &bat_priv->mcast.want_all_rtr4_list, mcast_want_all_rtr4_node) { if (!kref_get_unless_zero(&tmp_orig_node->refcount)) continue; orig_node = tmp_orig_node; break; } rcu_read_unlock(); return orig_node; } /** * batadv_mcast_forw_rtr6_node_get() - get a node with an ipv6 mcast router flag * @bat_priv: the bat priv with all the soft interface information * * Return: an orig_node which has the BATADV_MCAST_WANT_NO_RTR6 flag unset * and increases its refcount. */ static struct batadv_orig_node * batadv_mcast_forw_rtr6_node_get(struct batadv_priv *bat_priv) { struct batadv_orig_node *tmp_orig_node, *orig_node = NULL; rcu_read_lock(); hlist_for_each_entry_rcu(tmp_orig_node, &bat_priv->mcast.want_all_rtr6_list, mcast_want_all_rtr6_node) { if (!kref_get_unless_zero(&tmp_orig_node->refcount)) continue; orig_node = tmp_orig_node; break; } rcu_read_unlock(); return orig_node; } /** * batadv_mcast_forw_rtr_node_get() - get a node with an ipv4/ipv6 router flag * @bat_priv: the bat priv with all the soft interface information * @ethhdr: an ethernet header to determine the protocol family from * * Return: an orig_node which has no BATADV_MCAST_WANT_NO_RTR4 or * BATADV_MCAST_WANT_NO_RTR6 flag, depending on the provided ethhdr, set and * increases its refcount. */ static struct batadv_orig_node * batadv_mcast_forw_rtr_node_get(struct batadv_priv *bat_priv, struct ethhdr *ethhdr) { switch (ntohs(ethhdr->h_proto)) { case ETH_P_IP: return batadv_mcast_forw_rtr4_node_get(bat_priv); case ETH_P_IPV6: return batadv_mcast_forw_rtr6_node_get(bat_priv); default: /* we shouldn't be here... */ return NULL; } } /** * batadv_mcast_forw_mode() - check on how to forward a multicast packet * @bat_priv: the bat priv with all the soft interface information * @skb: The multicast packet to check * @orig: an originator to be set to forward the skb to * @is_routable: stores whether the destination is routable * * Return: the forwarding mode as enum batadv_forw_mode and in case of * BATADV_FORW_SINGLE set the orig to the single originator the skb * should be forwarded to. 
*/ enum batadv_forw_mode batadv_mcast_forw_mode(struct batadv_priv *bat_priv, struct sk_buff *skb, struct batadv_orig_node **orig, int *is_routable) { int ret, tt_count, ip_count, unsnoop_count, total_count; bool is_unsnoopable = false; unsigned int mcast_fanout; struct ethhdr *ethhdr; int rtr_count = 0; ret = batadv_mcast_forw_mode_check(bat_priv, skb, &is_unsnoopable, is_routable); if (ret == -ENOMEM) return BATADV_FORW_NONE; else if (ret < 0) return BATADV_FORW_ALL; ethhdr = eth_hdr(skb); tt_count = batadv_tt_global_hash_count(bat_priv, ethhdr->h_dest, BATADV_NO_FLAGS); ip_count = batadv_mcast_forw_want_all_ip_count(bat_priv, ethhdr); unsnoop_count = !is_unsnoopable ? 0 : atomic_read(&bat_priv->mcast.num_want_all_unsnoopables); rtr_count = batadv_mcast_forw_rtr_count(bat_priv, *is_routable); total_count = tt_count + ip_count + unsnoop_count + rtr_count; switch (total_count) { case 1: if (tt_count) *orig = batadv_mcast_forw_tt_node_get(bat_priv, ethhdr); else if (ip_count) *orig = batadv_mcast_forw_ip_node_get(bat_priv, ethhdr); else if (unsnoop_count) *orig = batadv_mcast_forw_unsnoop_node_get(bat_priv); else if (rtr_count) *orig = batadv_mcast_forw_rtr_node_get(bat_priv, ethhdr); if (*orig) return BATADV_FORW_SINGLE; fallthrough; case 0: return BATADV_FORW_NONE; default: mcast_fanout = atomic_read(&bat_priv->multicast_fanout); if (!unsnoop_count && total_count <= mcast_fanout) return BATADV_FORW_SOME; } return BATADV_FORW_ALL; } /** * batadv_mcast_forw_send_orig() - send a multicast packet to an originator * @bat_priv: the bat priv with all the soft interface information * @skb: the multicast packet to send * @vid: the vlan identifier * @orig_node: the originator to send the packet to * * Return: NET_XMIT_DROP in case of error or NET_XMIT_SUCCESS otherwise. */ int batadv_mcast_forw_send_orig(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid, struct batadv_orig_node *orig_node) { /* Avoid sending multicast-in-unicast packets to other BLA * gateways - they already got the frame from the LAN side * we share with them. * TODO: Refactor to take BLA into account earlier, to avoid * reducing the mcast_fanout count. */ if (batadv_bla_is_backbone_gw_orig(bat_priv, orig_node->orig, vid)) { dev_kfree_skb(skb); return NET_XMIT_SUCCESS; } return batadv_send_skb_unicast(bat_priv, skb, BATADV_UNICAST, 0, orig_node, vid); } /** * batadv_mcast_forw_tt() - forwards a packet to multicast listeners * @bat_priv: the bat priv with all the soft interface information * @skb: the multicast packet to transmit * @vid: the vlan identifier * * Sends copies of a frame with multicast destination to any multicast * listener registered in the translation table. A transmission is performed * via a batman-adv unicast packet for each such destination node. * * Return: NET_XMIT_DROP on memory allocation failure, NET_XMIT_SUCCESS * otherwise. 
*/ static int batadv_mcast_forw_tt(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid) { int ret = NET_XMIT_SUCCESS; struct sk_buff *newskb; struct batadv_tt_orig_list_entry *orig_entry; struct batadv_tt_global_entry *tt_global; const u8 *addr = eth_hdr(skb)->h_dest; tt_global = batadv_tt_global_hash_find(bat_priv, addr, vid); if (!tt_global) goto out; rcu_read_lock(); hlist_for_each_entry_rcu(orig_entry, &tt_global->orig_list, list) { newskb = skb_copy(skb, GFP_ATOMIC); if (!newskb) { ret = NET_XMIT_DROP; break; } batadv_mcast_forw_send_orig(bat_priv, newskb, vid, orig_entry->orig_node); } rcu_read_unlock(); batadv_tt_global_entry_put(tt_global); out: return ret; } /** * batadv_mcast_forw_want_all_ipv4() - forward to nodes with want-all-ipv4 * @bat_priv: the bat priv with all the soft interface information * @skb: the multicast packet to transmit * @vid: the vlan identifier * * Sends copies of a frame with multicast destination to any node with a * BATADV_MCAST_WANT_ALL_IPV4 flag set. A transmission is performed via a * batman-adv unicast packet for each such destination node. * * Return: NET_XMIT_DROP on memory allocation failure, NET_XMIT_SUCCESS * otherwise. */ static int batadv_mcast_forw_want_all_ipv4(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid) { struct batadv_orig_node *orig_node; int ret = NET_XMIT_SUCCESS; struct sk_buff *newskb; rcu_read_lock(); hlist_for_each_entry_rcu(orig_node, &bat_priv->mcast.want_all_ipv4_list, mcast_want_all_ipv4_node) { newskb = skb_copy(skb, GFP_ATOMIC); if (!newskb) { ret = NET_XMIT_DROP; break; } batadv_mcast_forw_send_orig(bat_priv, newskb, vid, orig_node); } rcu_read_unlock(); return ret; } /** * batadv_mcast_forw_want_all_ipv6() - forward to nodes with want-all-ipv6 * @bat_priv: the bat priv with all the soft interface information * @skb: The multicast packet to transmit * @vid: the vlan identifier * * Sends copies of a frame with multicast destination to any node with a * BATADV_MCAST_WANT_ALL_IPV6 flag set. A transmission is performed via a * batman-adv unicast packet for each such destination node. * * Return: NET_XMIT_DROP on memory allocation failure, NET_XMIT_SUCCESS * otherwise. */ static int batadv_mcast_forw_want_all_ipv6(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid) { struct batadv_orig_node *orig_node; int ret = NET_XMIT_SUCCESS; struct sk_buff *newskb; rcu_read_lock(); hlist_for_each_entry_rcu(orig_node, &bat_priv->mcast.want_all_ipv6_list, mcast_want_all_ipv6_node) { newskb = skb_copy(skb, GFP_ATOMIC); if (!newskb) { ret = NET_XMIT_DROP; break; } batadv_mcast_forw_send_orig(bat_priv, newskb, vid, orig_node); } rcu_read_unlock(); return ret; } /** * batadv_mcast_forw_want_all() - forward packet to nodes in a want-all list * @bat_priv: the bat priv with all the soft interface information * @skb: the multicast packet to transmit * @vid: the vlan identifier * * Sends copies of a frame with multicast destination to any node with a * BATADV_MCAST_WANT_ALL_IPV4 or BATADV_MCAST_WANT_ALL_IPV6 flag set. A * transmission is performed via a batman-adv unicast packet for each such * destination node. * * Return: NET_XMIT_DROP on memory allocation failure or if the protocol family * is neither IPv4 nor IPv6. NET_XMIT_SUCCESS otherwise. 
*/ static int batadv_mcast_forw_want_all(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid) { switch (ntohs(eth_hdr(skb)->h_proto)) { case ETH_P_IP: return batadv_mcast_forw_want_all_ipv4(bat_priv, skb, vid); case ETH_P_IPV6: return batadv_mcast_forw_want_all_ipv6(bat_priv, skb, vid); default: /* we shouldn't be here... */ return NET_XMIT_DROP; } } /** * batadv_mcast_forw_want_all_rtr4() - forward to nodes with want-all-rtr4 * @bat_priv: the bat priv with all the soft interface information * @skb: the multicast packet to transmit * @vid: the vlan identifier * * Sends copies of a frame with multicast destination to any node with a * BATADV_MCAST_WANT_NO_RTR4 flag unset. A transmission is performed via a * batman-adv unicast packet for each such destination node. * * Return: NET_XMIT_DROP on memory allocation failure, NET_XMIT_SUCCESS * otherwise. */ static int batadv_mcast_forw_want_all_rtr4(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid) { struct batadv_orig_node *orig_node; int ret = NET_XMIT_SUCCESS; struct sk_buff *newskb; rcu_read_lock(); hlist_for_each_entry_rcu(orig_node, &bat_priv->mcast.want_all_rtr4_list, mcast_want_all_rtr4_node) { newskb = skb_copy(skb, GFP_ATOMIC); if (!newskb) { ret = NET_XMIT_DROP; break; } batadv_mcast_forw_send_orig(bat_priv, newskb, vid, orig_node); } rcu_read_unlock(); return ret; } /** * batadv_mcast_forw_want_all_rtr6() - forward to nodes with want-all-rtr6 * @bat_priv: the bat priv with all the soft interface information * @skb: The multicast packet to transmit * @vid: the vlan identifier * * Sends copies of a frame with multicast destination to any node with a * BATADV_MCAST_WANT_NO_RTR6 flag unset. A transmission is performed via a * batman-adv unicast packet for each such destination node. * * Return: NET_XMIT_DROP on memory allocation failure, NET_XMIT_SUCCESS * otherwise. */ static int batadv_mcast_forw_want_all_rtr6(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid) { struct batadv_orig_node *orig_node; int ret = NET_XMIT_SUCCESS; struct sk_buff *newskb; rcu_read_lock(); hlist_for_each_entry_rcu(orig_node, &bat_priv->mcast.want_all_rtr6_list, mcast_want_all_rtr6_node) { newskb = skb_copy(skb, GFP_ATOMIC); if (!newskb) { ret = NET_XMIT_DROP; break; } batadv_mcast_forw_send_orig(bat_priv, newskb, vid, orig_node); } rcu_read_unlock(); return ret; } /** * batadv_mcast_forw_want_rtr() - forward packet to nodes in a want-all-rtr list * @bat_priv: the bat priv with all the soft interface information * @skb: the multicast packet to transmit * @vid: the vlan identifier * * Sends copies of a frame with multicast destination to any node with a * BATADV_MCAST_WANT_NO_RTR4 or BATADV_MCAST_WANT_NO_RTR6 flag unset. A * transmission is performed via a batman-adv unicast packet for each such * destination node. * * Return: NET_XMIT_DROP on memory allocation failure or if the protocol family * is neither IPv4 nor IPv6. NET_XMIT_SUCCESS otherwise. */ static int batadv_mcast_forw_want_rtr(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid) { switch (ntohs(eth_hdr(skb)->h_proto)) { case ETH_P_IP: return batadv_mcast_forw_want_all_rtr4(bat_priv, skb, vid); case ETH_P_IPV6: return batadv_mcast_forw_want_all_rtr6(bat_priv, skb, vid); default: /* we shouldn't be here... 
*/ return NET_XMIT_DROP; } } /** * batadv_mcast_forw_send() - send packet to any detected multicast recipient * @bat_priv: the bat priv with all the soft interface information * @skb: the multicast packet to transmit * @vid: the vlan identifier * @is_routable: stores whether the destination is routable * * Sends copies of a frame with multicast destination to any node that signaled * interest in it, that is either via the translation table or the according * want-all flags. A transmission is performed via a batman-adv unicast packet * for each such destination node. * * The given skb is consumed/freed. * * Return: NET_XMIT_DROP on memory allocation failure or if the protocol family * is neither IPv4 nor IPv6. NET_XMIT_SUCCESS otherwise. */ int batadv_mcast_forw_send(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid, int is_routable) { int ret; ret = batadv_mcast_forw_tt(bat_priv, skb, vid); if (ret != NET_XMIT_SUCCESS) { kfree_skb(skb); return ret; } ret = batadv_mcast_forw_want_all(bat_priv, skb, vid); if (ret != NET_XMIT_SUCCESS) { kfree_skb(skb); return ret; } if (!is_routable) goto skip_mc_router; ret = batadv_mcast_forw_want_rtr(bat_priv, skb, vid); if (ret != NET_XMIT_SUCCESS) { kfree_skb(skb); return ret; } skip_mc_router: consume_skb(skb); return ret; } /** * batadv_mcast_want_unsnoop_update() - update unsnoop counter and list * @bat_priv: the bat priv with all the soft interface information * @orig: the orig_node which multicast state might have changed of * @mcast_flags: flags indicating the new multicast state * * If the BATADV_MCAST_WANT_ALL_UNSNOOPABLES flag of this originator, * orig, has toggled then this method updates the counter and the list * accordingly. * * Caller needs to hold orig->mcast_handler_lock. */ static void batadv_mcast_want_unsnoop_update(struct batadv_priv *bat_priv, struct batadv_orig_node *orig, u8 mcast_flags) { struct hlist_node *node = &orig->mcast_want_all_unsnoopables_node; struct hlist_head *head = &bat_priv->mcast.want_all_unsnoopables_list; lockdep_assert_held(&orig->mcast_handler_lock); /* switched from flag unset to set */ if (mcast_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES && !(orig->mcast_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES)) { atomic_inc(&bat_priv->mcast.num_want_all_unsnoopables); spin_lock_bh(&bat_priv->mcast.want_lists_lock); /* flag checks above + mcast_handler_lock prevents this */ WARN_ON(!hlist_unhashed(node)); hlist_add_head_rcu(node, head); spin_unlock_bh(&bat_priv->mcast.want_lists_lock); /* switched from flag set to unset */ } else if (!(mcast_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES) && orig->mcast_flags & BATADV_MCAST_WANT_ALL_UNSNOOPABLES) { atomic_dec(&bat_priv->mcast.num_want_all_unsnoopables); spin_lock_bh(&bat_priv->mcast.want_lists_lock); /* flag checks above + mcast_handler_lock prevents this */ WARN_ON(hlist_unhashed(node)); hlist_del_init_rcu(node); spin_unlock_bh(&bat_priv->mcast.want_lists_lock); } } /** * batadv_mcast_want_ipv4_update() - update want-all-ipv4 counter and list * @bat_priv: the bat priv with all the soft interface information * @orig: the orig_node which multicast state might have changed of * @mcast_flags: flags indicating the new multicast state * * If the BATADV_MCAST_WANT_ALL_IPV4 flag of this originator, orig, has * toggled then this method updates the counter and the list accordingly. * * Caller needs to hold orig->mcast_handler_lock. 
*/ static void batadv_mcast_want_ipv4_update(struct batadv_priv *bat_priv, struct batadv_orig_node *orig, u8 mcast_flags) { struct hlist_node *node = &orig->mcast_want_all_ipv4_node; struct hlist_head *head = &bat_priv->mcast.want_all_ipv4_list; lockdep_assert_held(&orig->mcast_handler_lock); /* switched from flag unset to set */ if (mcast_flags & BATADV_MCAST_WANT_ALL_IPV4 && !(orig->mcast_flags & BATADV_MCAST_WANT_ALL_IPV4)) { atomic_inc(&bat_priv->mcast.num_want_all_ipv4); spin_lock_bh(&bat_priv->mcast.want_lists_lock); /* flag checks above + mcast_handler_lock prevents this */ WARN_ON(!hlist_unhashed(node)); hlist_add_head_rcu(node, head); spin_unlock_bh(&bat_priv->mcast.want_lists_lock); /* switched from flag set to unset */ } else if (!(mcast_flags & BATADV_MCAST_WANT_ALL_IPV4) && orig->mcast_flags & BATADV_MCAST_WANT_ALL_IPV4) { atomic_dec(&bat_priv->mcast.num_want_all_ipv4); spin_lock_bh(&bat_priv->mcast.want_lists_lock); /* flag checks above + mcast_handler_lock prevents this */ WARN_ON(hlist_unhashed(node)); hlist_del_init_rcu(node); spin_unlock_bh(&bat_priv->mcast.want_lists_lock); } } /** * batadv_mcast_want_ipv6_update() - update want-all-ipv6 counter and list * @bat_priv: the bat priv with all the soft interface information * @orig: the orig_node which multicast state might have changed of * @mcast_flags: flags indicating the new multicast state * * If the BATADV_MCAST_WANT_ALL_IPV6 flag of this originator, orig, has * toggled then this method updates the counter and the list accordingly. * * Caller needs to hold orig->mcast_handler_lock. */ static void batadv_mcast_want_ipv6_update(struct batadv_priv *bat_priv, struct batadv_orig_node *orig, u8 mcast_flags) { struct hlist_node *node = &orig->mcast_want_all_ipv6_node; struct hlist_head *head = &bat_priv->mcast.want_all_ipv6_list; lockdep_assert_held(&orig->mcast_handler_lock); /* switched from flag unset to set */ if (mcast_flags & BATADV_MCAST_WANT_ALL_IPV6 && !(orig->mcast_flags & BATADV_MCAST_WANT_ALL_IPV6)) { atomic_inc(&bat_priv->mcast.num_want_all_ipv6); spin_lock_bh(&bat_priv->mcast.want_lists_lock); /* flag checks above + mcast_handler_lock prevents this */ WARN_ON(!hlist_unhashed(node)); hlist_add_head_rcu(node, head); spin_unlock_bh(&bat_priv->mcast.want_lists_lock); /* switched from flag set to unset */ } else if (!(mcast_flags & BATADV_MCAST_WANT_ALL_IPV6) && orig->mcast_flags & BATADV_MCAST_WANT_ALL_IPV6) { atomic_dec(&bat_priv->mcast.num_want_all_ipv6); spin_lock_bh(&bat_priv->mcast.want_lists_lock); /* flag checks above + mcast_handler_lock prevents this */ WARN_ON(hlist_unhashed(node)); hlist_del_init_rcu(node); spin_unlock_bh(&bat_priv->mcast.want_lists_lock); } } /** * batadv_mcast_want_rtr4_update() - update want-all-rtr4 counter and list * @bat_priv: the bat priv with all the soft interface information * @orig: the orig_node which multicast state might have changed of * @mcast_flags: flags indicating the new multicast state * * If the BATADV_MCAST_WANT_NO_RTR4 flag of this originator, orig, has * toggled then this method updates the counter and the list accordingly. * * Caller needs to hold orig->mcast_handler_lock. 
*/ static void batadv_mcast_want_rtr4_update(struct batadv_priv *bat_priv, struct batadv_orig_node *orig, u8 mcast_flags) { struct hlist_node *node = &orig->mcast_want_all_rtr4_node; struct hlist_head *head = &bat_priv->mcast.want_all_rtr4_list; lockdep_assert_held(&orig->mcast_handler_lock); /* switched from flag set to unset */ if (!(mcast_flags & BATADV_MCAST_WANT_NO_RTR4) && orig->mcast_flags & BATADV_MCAST_WANT_NO_RTR4) { atomic_inc(&bat_priv->mcast.num_want_all_rtr4); spin_lock_bh(&bat_priv->mcast.want_lists_lock); /* flag checks above + mcast_handler_lock prevents this */ WARN_ON(!hlist_unhashed(node)); hlist_add_head_rcu(node, head); spin_unlock_bh(&bat_priv->mcast.want_lists_lock); /* switched from flag unset to set */ } else if (mcast_flags & BATADV_MCAST_WANT_NO_RTR4 && !(orig->mcast_flags & BATADV_MCAST_WANT_NO_RTR4)) { atomic_dec(&bat_priv->mcast.num_want_all_rtr4); spin_lock_bh(&bat_priv->mcast.want_lists_lock); /* flag checks above + mcast_handler_lock prevents this */ WARN_ON(hlist_unhashed(node)); hlist_del_init_rcu(node); spin_unlock_bh(&bat_priv->mcast.want_lists_lock); } } /** * batadv_mcast_want_rtr6_update() - update want-all-rtr6 counter and list * @bat_priv: the bat priv with all the soft interface information * @orig: the orig_node which multicast state might have changed of * @mcast_flags: flags indicating the new multicast state * * If the BATADV_MCAST_WANT_NO_RTR6 flag of this originator, orig, has * toggled then this method updates the counter and the list accordingly. * * Caller needs to hold orig->mcast_handler_lock. */ static void batadv_mcast_want_rtr6_update(struct batadv_priv *bat_priv, struct batadv_orig_node *orig, u8 mcast_flags) { struct hlist_node *node = &orig->mcast_want_all_rtr6_node; struct hlist_head *head = &bat_priv->mcast.want_all_rtr6_list; lockdep_assert_held(&orig->mcast_handler_lock); /* switched from flag set to unset */ if (!(mcast_flags & BATADV_MCAST_WANT_NO_RTR6) && orig->mcast_flags & BATADV_MCAST_WANT_NO_RTR6) { atomic_inc(&bat_priv->mcast.num_want_all_rtr6); spin_lock_bh(&bat_priv->mcast.want_lists_lock); /* flag checks above + mcast_handler_lock prevents this */ WARN_ON(!hlist_unhashed(node)); hlist_add_head_rcu(node, head); spin_unlock_bh(&bat_priv->mcast.want_lists_lock); /* switched from flag unset to set */ } else if (mcast_flags & BATADV_MCAST_WANT_NO_RTR6 && !(orig->mcast_flags & BATADV_MCAST_WANT_NO_RTR6)) { atomic_dec(&bat_priv->mcast.num_want_all_rtr6); spin_lock_bh(&bat_priv->mcast.want_lists_lock); /* flag checks above + mcast_handler_lock prevents this */ WARN_ON(hlist_unhashed(node)); hlist_del_init_rcu(node); spin_unlock_bh(&bat_priv->mcast.want_lists_lock); } } /** * batadv_mcast_tvlv_flags_get() - get multicast flags from an OGM TVLV * @enabled: whether the originator has multicast TVLV support enabled * @tvlv_value: tvlv buffer containing the multicast flags * @tvlv_value_len: tvlv buffer length * * Return: multicast flags for the given tvlv buffer */ static u8 batadv_mcast_tvlv_flags_get(bool enabled, void *tvlv_value, u16 tvlv_value_len) { u8 mcast_flags = BATADV_NO_FLAGS; if (enabled && tvlv_value && tvlv_value_len >= sizeof(mcast_flags)) mcast_flags = *(u8 *)tvlv_value; if (!enabled) { mcast_flags |= BATADV_MCAST_WANT_ALL_IPV4; mcast_flags |= BATADV_MCAST_WANT_ALL_IPV6; } /* remove redundant flags to avoid sending duplicate packets later */ if (mcast_flags & BATADV_MCAST_WANT_ALL_IPV4) mcast_flags |= BATADV_MCAST_WANT_NO_RTR4; if (mcast_flags & BATADV_MCAST_WANT_ALL_IPV6) mcast_flags |= 
BATADV_MCAST_WANT_NO_RTR6; return mcast_flags; } /** * batadv_mcast_tvlv_ogm_handler() - process incoming multicast tvlv container * @bat_priv: the bat priv with all the soft interface information * @orig: the orig_node of the ogm * @flags: flags indicating the tvlv state (see batadv_tvlv_handler_flags) * @tvlv_value: tvlv buffer containing the multicast data * @tvlv_value_len: tvlv buffer length */ static void batadv_mcast_tvlv_ogm_handler(struct batadv_priv *bat_priv, struct batadv_orig_node *orig, u8 flags, void *tvlv_value, u16 tvlv_value_len) { bool orig_mcast_enabled = !(flags & BATADV_TVLV_HANDLER_OGM_CIFNOTFND); u8 mcast_flags; mcast_flags = batadv_mcast_tvlv_flags_get(orig_mcast_enabled, tvlv_value, tvlv_value_len); spin_lock_bh(&orig->mcast_handler_lock); if (orig_mcast_enabled && !test_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capabilities)) { set_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capabilities); } else if (!orig_mcast_enabled && test_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capabilities)) { clear_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capabilities); } set_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig->capa_initialized); batadv_mcast_want_unsnoop_update(bat_priv, orig, mcast_flags); batadv_mcast_want_ipv4_update(bat_priv, orig, mcast_flags); batadv_mcast_want_ipv6_update(bat_priv, orig, mcast_flags); batadv_mcast_want_rtr4_update(bat_priv, orig, mcast_flags); batadv_mcast_want_rtr6_update(bat_priv, orig, mcast_flags); orig->mcast_flags = mcast_flags; spin_unlock_bh(&orig->mcast_handler_lock); } /** * batadv_mcast_init() - initialize the multicast optimizations structures * @bat_priv: the bat priv with all the soft interface information */ void batadv_mcast_init(struct batadv_priv *bat_priv) { batadv_tvlv_handler_register(bat_priv, batadv_mcast_tvlv_ogm_handler, NULL, BATADV_TVLV_MCAST, 2, BATADV_TVLV_HANDLER_OGM_CIFNOTFND); INIT_DELAYED_WORK(&bat_priv->mcast.work, batadv_mcast_mla_update); batadv_mcast_start_timer(bat_priv); } /** * batadv_mcast_mesh_info_put() - put multicast info into a netlink message * @msg: buffer for the message * @bat_priv: the bat priv with all the soft interface information * * Return: 0 or error code. */ int batadv_mcast_mesh_info_put(struct sk_buff *msg, struct batadv_priv *bat_priv) { u32 flags = bat_priv->mcast.mla_flags.tvlv_flags; u32 flags_priv = BATADV_NO_FLAGS; if (bat_priv->mcast.mla_flags.bridged) { flags_priv |= BATADV_MCAST_FLAGS_BRIDGED; if (bat_priv->mcast.mla_flags.querier_ipv4.exists) flags_priv |= BATADV_MCAST_FLAGS_QUERIER_IPV4_EXISTS; if (bat_priv->mcast.mla_flags.querier_ipv6.exists) flags_priv |= BATADV_MCAST_FLAGS_QUERIER_IPV6_EXISTS; if (bat_priv->mcast.mla_flags.querier_ipv4.shadowing) flags_priv |= BATADV_MCAST_FLAGS_QUERIER_IPV4_SHADOWING; if (bat_priv->mcast.mla_flags.querier_ipv6.shadowing) flags_priv |= BATADV_MCAST_FLAGS_QUERIER_IPV6_SHADOWING; } if (nla_put_u32(msg, BATADV_ATTR_MCAST_FLAGS, flags) || nla_put_u32(msg, BATADV_ATTR_MCAST_FLAGS_PRIV, flags_priv)) return -EMSGSIZE; return 0; } /** * batadv_mcast_flags_dump_entry() - dump one entry of the multicast flags table * to a netlink socket * @msg: buffer for the message * @portid: netlink port * @cb: Control block containing additional options * @orig_node: originator to dump the multicast flags of * * Return: 0 or error code. 
*/ static int batadv_mcast_flags_dump_entry(struct sk_buff *msg, u32 portid, struct netlink_callback *cb, struct batadv_orig_node *orig_node) { void *hdr; hdr = genlmsg_put(msg, portid, cb->nlh->nlmsg_seq, &batadv_netlink_family, NLM_F_MULTI, BATADV_CMD_GET_MCAST_FLAGS); if (!hdr) return -ENOBUFS; genl_dump_check_consistent(cb, hdr); if (nla_put(msg, BATADV_ATTR_ORIG_ADDRESS, ETH_ALEN, orig_node->orig)) { genlmsg_cancel(msg, hdr); return -EMSGSIZE; } if (test_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig_node->capabilities)) { if (nla_put_u32(msg, BATADV_ATTR_MCAST_FLAGS, orig_node->mcast_flags)) { genlmsg_cancel(msg, hdr); return -EMSGSIZE; } } genlmsg_end(msg, hdr); return 0; } /** * batadv_mcast_flags_dump_bucket() - dump one bucket of the multicast flags * table to a netlink socket * @msg: buffer for the message * @portid: netlink port * @cb: Control block containing additional options * @hash: hash to dump * @bucket: bucket index to dump * @idx_skip: How many entries to skip * * Return: 0 or error code. */ static int batadv_mcast_flags_dump_bucket(struct sk_buff *msg, u32 portid, struct netlink_callback *cb, struct batadv_hashtable *hash, unsigned int bucket, long *idx_skip) { struct batadv_orig_node *orig_node; long idx = 0; spin_lock_bh(&hash->list_locks[bucket]); cb->seq = atomic_read(&hash->generation) << 1 | 1; hlist_for_each_entry(orig_node, &hash->table[bucket], hash_entry) { if (!test_bit(BATADV_ORIG_CAPA_HAS_MCAST, &orig_node->capa_initialized)) continue; if (idx < *idx_skip) goto skip; if (batadv_mcast_flags_dump_entry(msg, portid, cb, orig_node)) { spin_unlock_bh(&hash->list_locks[bucket]); *idx_skip = idx; return -EMSGSIZE; } skip: idx++; } spin_unlock_bh(&hash->list_locks[bucket]); return 0; } /** * __batadv_mcast_flags_dump() - dump multicast flags table to a netlink socket * @msg: buffer for the message * @portid: netlink port * @cb: Control block containing additional options * @bat_priv: the bat priv with all the soft interface information * @bucket: current bucket to dump * @idx: index in current bucket to the next entry to dump * * Return: 0 or error code. */ static int __batadv_mcast_flags_dump(struct sk_buff *msg, u32 portid, struct netlink_callback *cb, struct batadv_priv *bat_priv, long *bucket, long *idx) { struct batadv_hashtable *hash = bat_priv->orig_hash; long bucket_tmp = *bucket; long idx_tmp = *idx; while (bucket_tmp < hash->size) { if (batadv_mcast_flags_dump_bucket(msg, portid, cb, hash, bucket_tmp, &idx_tmp)) break; bucket_tmp++; idx_tmp = 0; } *bucket = bucket_tmp; *idx = idx_tmp; return msg->len; } /** * batadv_mcast_netlink_get_primary() - get primary interface from netlink * callback * @cb: netlink callback structure * @primary_if: the primary interface pointer to return the result in * * Return: 0 or error code. 
*/ static int batadv_mcast_netlink_get_primary(struct netlink_callback *cb, struct batadv_hard_iface **primary_if) { struct batadv_hard_iface *hard_iface = NULL; struct net *net = sock_net(cb->skb->sk); struct net_device *soft_iface; struct batadv_priv *bat_priv; int ifindex; int ret = 0; ifindex = batadv_netlink_get_ifindex(cb->nlh, BATADV_ATTR_MESH_IFINDEX); if (!ifindex) return -EINVAL; soft_iface = dev_get_by_index(net, ifindex); if (!soft_iface || !batadv_softif_is_valid(soft_iface)) { ret = -ENODEV; goto out; } bat_priv = netdev_priv(soft_iface); hard_iface = batadv_primary_if_get_selected(bat_priv); if (!hard_iface || hard_iface->if_status != BATADV_IF_ACTIVE) { ret = -ENOENT; goto out; } out: dev_put(soft_iface); if (!ret && primary_if) *primary_if = hard_iface; else batadv_hardif_put(hard_iface); return ret; } /** * batadv_mcast_flags_dump() - dump multicast flags table to a netlink socket * @msg: buffer for the message * @cb: callback structure containing arguments * * Return: message length. */ int batadv_mcast_flags_dump(struct sk_buff *msg, struct netlink_callback *cb) { struct batadv_hard_iface *primary_if = NULL; int portid = NETLINK_CB(cb->skb).portid; struct batadv_priv *bat_priv; long *bucket = &cb->args[0]; long *idx = &cb->args[1]; int ret; ret = batadv_mcast_netlink_get_primary(cb, &primary_if); if (ret) return ret; bat_priv = netdev_priv(primary_if->soft_iface); ret = __batadv_mcast_flags_dump(msg, portid, cb, bat_priv, bucket, idx); batadv_hardif_put(primary_if); return ret; } /** * batadv_mcast_free() - free the multicast optimizations structures * @bat_priv: the bat priv with all the soft interface information */ void batadv_mcast_free(struct batadv_priv *bat_priv) { cancel_delayed_work_sync(&bat_priv->mcast.work); batadv_tvlv_container_unregister(bat_priv, BATADV_TVLV_MCAST, 2); batadv_tvlv_handler_unregister(bat_priv, BATADV_TVLV_MCAST, 2); /* safely calling outside of worker, as worker was canceled above */ batadv_mcast_mla_tt_retract(bat_priv, NULL); } /** * batadv_mcast_purge_orig() - reset originator global mcast state modifications * @orig: the originator which is going to get purged */ void batadv_mcast_purge_orig(struct batadv_orig_node *orig) { struct batadv_priv *bat_priv = orig->bat_priv; spin_lock_bh(&orig->mcast_handler_lock); batadv_mcast_want_unsnoop_update(bat_priv, orig, BATADV_NO_FLAGS); batadv_mcast_want_ipv4_update(bat_priv, orig, BATADV_NO_FLAGS); batadv_mcast_want_ipv6_update(bat_priv, orig, BATADV_NO_FLAGS); batadv_mcast_want_rtr4_update(bat_priv, orig, BATADV_MCAST_WANT_NO_RTR4); batadv_mcast_want_rtr6_update(bat_priv, orig, BATADV_MCAST_WANT_NO_RTR6); spin_unlock_bh(&orig->mcast_handler_lock); }
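/*
 * Illustrative sketch, not part of multicast.c above: roughly how a transmit
 * path could consume the forwarding-mode API. The helper name
 * example_mcast_xmit() and its error handling are hypothetical; only
 * batadv_mcast_forw_mode(), batadv_mcast_forw_send(),
 * batadv_mcast_forw_send_orig() and the BATADV_FORW_* values are taken from
 * the code above, and batadv_orig_node_put() is assumed to be the matching
 * release for the orig_node that is returned with an elevated refcount.
 */
static int example_mcast_xmit(struct batadv_priv *bat_priv,
			      struct sk_buff *skb, unsigned short vid)
{
	struct batadv_orig_node *mcast_single_orig = NULL;
	int mcast_is_routable = 0;

	switch (batadv_mcast_forw_mode(bat_priv, skb, &mcast_single_orig,
				       &mcast_is_routable)) {
	case BATADV_FORW_NONE:
		/* no node is interested in this frame: drop it silently */
		kfree_skb(skb);
		return NET_XMIT_SUCCESS;
	case BATADV_FORW_SINGLE: {
		/* exactly one interested node: send one unicast copy */
		int ret = batadv_mcast_forw_send_orig(bat_priv, skb, vid,
						      mcast_single_orig);

		batadv_orig_node_put(mcast_single_orig);
		return ret;
	}
	case BATADV_FORW_SOME:
		/* a few nodes within multicast_fanout: one unicast per node,
		 * the skb is consumed by batadv_mcast_forw_send()
		 */
		return batadv_mcast_forw_send(bat_priv, skb, vid,
					      mcast_is_routable);
	case BATADV_FORW_ALL:
	default:
		/* too many or unknown listeners: the real caller would hand
		 * the skb to the classic flooding/broadcast path here, which
		 * is outside the scope of this sketch
		 */
		kfree_skb(skb);
		return NET_XMIT_DROP;
	}
}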
// SPDX-License-Identifier: GPL-2.0-only /* * Generic helpers for smp ipi calls * * (C) Jens Axboe <jens.axboe@oracle.com> 2008 */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/irq_work.h> #include <linux/rcupdate.h> #include <linux/rculist.h> #include <linux/kernel.h> #include <linux/export.h> #include <linux/percpu.h> #include <linux/init.h> #include <linux/interrupt.h> #include <linux/gfp.h> #include <linux/smp.h> #include <linux/cpu.h> #include <linux/sched.h> #include <linux/sched/idle.h> #include <linux/hypervisor.h> #include <linux/sched/clock.h> #include <linux/nmi.h> #include <linux/sched/debug.h> #include <linux/jump_label.h> #include "smpboot.h" #include "sched/smp.h" #define CSD_TYPE(_csd) ((_csd)->node.u_flags & CSD_FLAG_TYPE_MASK) #ifdef CONFIG_CSD_LOCK_WAIT_DEBUG union cfd_seq_cnt { u64 val; struct { u64 src:16; u64 dst:16; #define CFD_SEQ_NOCPU 0xffff u64 type:4; #define CFD_SEQ_QUEUE 0 #define CFD_SEQ_IPI 1 #define CFD_SEQ_NOIPI 2 #define CFD_SEQ_PING 3 #define CFD_SEQ_PINGED 4 #define CFD_SEQ_HANDLE 5 #define CFD_SEQ_DEQUEUE 6 #define CFD_SEQ_IDLE 7 #define CFD_SEQ_GOTIPI 8 #define CFD_SEQ_HDLEND 9 u64 cnt:28; } u; }; static char *seq_type[] = { [CFD_SEQ_QUEUE] = "queue", [CFD_SEQ_IPI] = "ipi", [CFD_SEQ_NOIPI] = "noipi", [CFD_SEQ_PING] = "ping", [CFD_SEQ_PINGED] = "pinged", [CFD_SEQ_HANDLE] = "handle", [CFD_SEQ_DEQUEUE] = "dequeue (src CPU 0 == empty)", [CFD_SEQ_IDLE] = "idle", [CFD_SEQ_GOTIPI] = "gotipi", [CFD_SEQ_HDLEND] = "hdlend (src CPU 0 == early)", }; struct cfd_seq_local { u64 ping; u64 pinged; u64 handle; u64 dequeue; u64 idle; u64 gotipi; u64 hdlend; }; #endif struct cfd_percpu { call_single_data_t csd; #ifdef CONFIG_CSD_LOCK_WAIT_DEBUG u64 seq_queue; u64 seq_ipi; u64 seq_noipi; #endif }; struct call_function_data { struct cfd_percpu 
__percpu *pcpu; cpumask_var_t cpumask; cpumask_var_t cpumask_ipi; }; static DEFINE_PER_CPU_ALIGNED(struct call_function_data, cfd_data); static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue); static void flush_smp_call_function_queue(bool warn_cpu_offline); int smpcfd_prepare_cpu(unsigned int cpu) { struct call_function_data *cfd = &per_cpu(cfd_data, cpu); if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL, cpu_to_node(cpu))) return -ENOMEM; if (!zalloc_cpumask_var_node(&cfd->cpumask_ipi, GFP_KERNEL, cpu_to_node(cpu))) { free_cpumask_var(cfd->cpumask); return -ENOMEM; } cfd->pcpu = alloc_percpu(struct cfd_percpu); if (!cfd->pcpu) { free_cpumask_var(cfd->cpumask); free_cpumask_var(cfd->cpumask_ipi); return -ENOMEM; } return 0; } int smpcfd_dead_cpu(unsigned int cpu) { struct call_function_data *cfd = &per_cpu(cfd_data, cpu); free_cpumask_var(cfd->cpumask); free_cpumask_var(cfd->cpumask_ipi); free_percpu(cfd->pcpu); return 0; } int smpcfd_dying_cpu(unsigned int cpu) { /* * The IPIs for the smp-call-function callbacks queued by other * CPUs might arrive late, either due to hardware latencies or * because this CPU disabled interrupts (inside stop-machine) * before the IPIs were sent. So flush out any pending callbacks * explicitly (without waiting for the IPIs to arrive), to * ensure that the outgoing CPU doesn't go offline with work * still pending. */ flush_smp_call_function_queue(false); irq_work_run(); return 0; } void __init call_function_init(void) { int i; for_each_possible_cpu(i) init_llist_head(&per_cpu(call_single_queue, i)); smpcfd_prepare_cpu(smp_processor_id()); } #ifdef CONFIG_CSD_LOCK_WAIT_DEBUG static DEFINE_STATIC_KEY_FALSE(csdlock_debug_enabled); static DEFINE_STATIC_KEY_FALSE(csdlock_debug_extended); static int __init csdlock_debug(char *str) { unsigned int val = 0; if (str && !strcmp(str, "ext")) { val = 1; static_branch_enable(&csdlock_debug_extended); } else get_option(&str, &val); if (val) static_branch_enable(&csdlock_debug_enabled); return 1; } __setup("csdlock_debug=", csdlock_debug); static DEFINE_PER_CPU(call_single_data_t *, cur_csd); static DEFINE_PER_CPU(smp_call_func_t, cur_csd_func); static DEFINE_PER_CPU(void *, cur_csd_info); static DEFINE_PER_CPU(struct cfd_seq_local, cfd_seq_local); #define CSD_LOCK_TIMEOUT (5ULL * NSEC_PER_SEC) static atomic_t csd_bug_count = ATOMIC_INIT(0); static u64 cfd_seq; #define CFD_SEQ(s, d, t, c) \ (union cfd_seq_cnt){ .u.src = s, .u.dst = d, .u.type = t, .u.cnt = c } static u64 cfd_seq_inc(unsigned int src, unsigned int dst, unsigned int type) { union cfd_seq_cnt new, old; new = CFD_SEQ(src, dst, type, 0); do { old.val = READ_ONCE(cfd_seq); new.u.cnt = old.u.cnt + 1; } while (cmpxchg(&cfd_seq, old.val, new.val) != old.val); return old.val; } #define cfd_seq_store(var, src, dst, type) \ do { \ if (static_branch_unlikely(&csdlock_debug_extended)) \ var = cfd_seq_inc(src, dst, type); \ } while (0) /* Record current CSD work for current CPU, NULL to erase. */ static void __csd_lock_record(struct __call_single_data *csd) { if (!csd) { smp_mb(); /* NULL cur_csd after unlock. */ __this_cpu_write(cur_csd, NULL); return; } __this_cpu_write(cur_csd_func, csd->func); __this_cpu_write(cur_csd_info, csd->info); smp_wmb(); /* func and info before csd. */ __this_cpu_write(cur_csd, csd); smp_mb(); /* Update cur_csd before function call. */ /* Or before unlock, as the case may be. 
*/ } static __always_inline void csd_lock_record(struct __call_single_data *csd) { if (static_branch_unlikely(&csdlock_debug_enabled)) __csd_lock_record(csd); } static int csd_lock_wait_getcpu(struct __call_single_data *csd) { unsigned int csd_type; csd_type = CSD_TYPE(csd); if (csd_type == CSD_TYPE_ASYNC || csd_type == CSD_TYPE_SYNC) return csd->node.dst; /* Other CSD_TYPE_ values might not have ->dst. */ return -1; } static void cfd_seq_data_add(u64 val, unsigned int src, unsigned int dst, unsigned int type, union cfd_seq_cnt *data, unsigned int *n_data, unsigned int now) { union cfd_seq_cnt new[2]; unsigned int i, j, k; new[0].val = val; new[1] = CFD_SEQ(src, dst, type, new[0].u.cnt + 1); for (i = 0; i < 2; i++) { if (new[i].u.cnt <= now) new[i].u.cnt |= 0x80000000U; for (j = 0; j < *n_data; j++) { if (new[i].u.cnt == data[j].u.cnt) { /* Direct read value trumps generated one. */ if (i == 0) data[j].val = new[i].val; break; } if (new[i].u.cnt < data[j].u.cnt) { for (k = *n_data; k > j; k--) data[k].val = data[k - 1].val; data[j].val = new[i].val; (*n_data)++; break; } } if (j == *n_data) { data[j].val = new[i].val; (*n_data)++; } } } static const char *csd_lock_get_type(unsigned int type) { return (type >= ARRAY_SIZE(seq_type)) ? "?" : seq_type[type]; } static void csd_lock_print_extended(struct __call_single_data *csd, int cpu) { struct cfd_seq_local *seq = &per_cpu(cfd_seq_local, cpu); unsigned int srccpu = csd->node.src; struct call_function_data *cfd = per_cpu_ptr(&cfd_data, srccpu); struct cfd_percpu *pcpu = per_cpu_ptr(cfd->pcpu, cpu); unsigned int now; union cfd_seq_cnt data[2 * ARRAY_SIZE(seq_type)]; unsigned int n_data = 0, i; data[0].val = READ_ONCE(cfd_seq); now = data[0].u.cnt; cfd_seq_data_add(pcpu->seq_queue, srccpu, cpu, CFD_SEQ_QUEUE, data, &n_data, now); cfd_seq_data_add(pcpu->seq_ipi, srccpu, cpu, CFD_SEQ_IPI, data, &n_data, now); cfd_seq_data_add(pcpu->seq_noipi, srccpu, cpu, CFD_SEQ_NOIPI, data, &n_data, now); cfd_seq_data_add(per_cpu(cfd_seq_local.ping, srccpu), srccpu, CFD_SEQ_NOCPU, CFD_SEQ_PING, data, &n_data, now); cfd_seq_data_add(per_cpu(cfd_seq_local.pinged, srccpu), srccpu, CFD_SEQ_NOCPU, CFD_SEQ_PINGED, data, &n_data, now); cfd_seq_data_add(seq->idle, CFD_SEQ_NOCPU, cpu, CFD_SEQ_IDLE, data, &n_data, now); cfd_seq_data_add(seq->gotipi, CFD_SEQ_NOCPU, cpu, CFD_SEQ_GOTIPI, data, &n_data, now); cfd_seq_data_add(seq->handle, CFD_SEQ_NOCPU, cpu, CFD_SEQ_HANDLE, data, &n_data, now); cfd_seq_data_add(seq->dequeue, CFD_SEQ_NOCPU, cpu, CFD_SEQ_DEQUEUE, data, &n_data, now); cfd_seq_data_add(seq->hdlend, CFD_SEQ_NOCPU, cpu, CFD_SEQ_HDLEND, data, &n_data, now); for (i = 0; i < n_data; i++) { pr_alert("\tcsd: cnt(%07x): %04x->%04x %s\n", data[i].u.cnt & ~0x80000000U, data[i].u.src, data[i].u.dst, csd_lock_get_type(data[i].u.type)); } pr_alert("\tcsd: cnt now: %07x\n", now); } /* * Complain if too much time spent waiting. Note that only * the CSD_TYPE_SYNC/ASYNC types provide the destination CPU, * so waiting on other types gets much less information. 
*/ static bool csd_lock_wait_toolong(struct __call_single_data *csd, u64 ts0, u64 *ts1, int *bug_id) { int cpu = -1; int cpux; bool firsttime; u64 ts2, ts_delta; call_single_data_t *cpu_cur_csd; unsigned int flags = READ_ONCE(csd->node.u_flags); if (!(flags & CSD_FLAG_LOCK)) { if (!unlikely(*bug_id)) return true; cpu = csd_lock_wait_getcpu(csd); pr_alert("csd: CSD lock (#%d) got unstuck on CPU#%02d, CPU#%02d released the lock.\n", *bug_id, raw_smp_processor_id(), cpu); return true; } ts2 = sched_clock(); ts_delta = ts2 - *ts1; if (likely(ts_delta <= CSD_LOCK_TIMEOUT)) return false; firsttime = !*bug_id; if (firsttime) *bug_id = atomic_inc_return(&csd_bug_count); cpu = csd_lock_wait_getcpu(csd); if (WARN_ONCE(cpu < 0 || cpu >= nr_cpu_ids, "%s: cpu = %d\n", __func__, cpu)) cpux = 0; else cpux = cpu; cpu_cur_csd = smp_load_acquire(&per_cpu(cur_csd, cpux)); /* Before func and info. */ pr_alert("csd: %s non-responsive CSD lock (#%d) on CPU#%d, waiting %llu ns for CPU#%02d %pS(%ps).\n", firsttime ? "Detected" : "Continued", *bug_id, raw_smp_processor_id(), ts2 - ts0, cpu, csd->func, csd->info); if (cpu_cur_csd && csd != cpu_cur_csd) { pr_alert("\tcsd: CSD lock (#%d) handling prior %pS(%ps) request.\n", *bug_id, READ_ONCE(per_cpu(cur_csd_func, cpux)), READ_ONCE(per_cpu(cur_csd_info, cpux))); } else { pr_alert("\tcsd: CSD lock (#%d) %s.\n", *bug_id, !cpu_cur_csd ? "unresponsive" : "handling this request"); } if (cpu >= 0) { if (static_branch_unlikely(&csdlock_debug_extended)) csd_lock_print_extended(csd, cpu); if (!trigger_single_cpu_backtrace(cpu)) dump_cpu_task(cpu); if (!cpu_cur_csd) { pr_alert("csd: Re-sending CSD lock (#%d) IPI from CPU#%02d to CPU#%02d\n", *bug_id, raw_smp_processor_id(), cpu); arch_send_call_function_single_ipi(cpu); } } dump_stack(); *ts1 = ts2; return false; } /* * csd_lock/csd_unlock used to serialize access to per-cpu csd resources * * For non-synchronous ipi calls the csd can still be in use by the * previous function call. For multi-cpu calls its even more interesting * as we'll have to ensure no other cpu is observing our csd. 
*/ static void __csd_lock_wait(struct __call_single_data *csd) { int bug_id = 0; u64 ts0, ts1; ts1 = ts0 = sched_clock(); for (;;) { if (csd_lock_wait_toolong(csd, ts0, &ts1, &bug_id)) break; cpu_relax(); } smp_acquire__after_ctrl_dep(); } static __always_inline void csd_lock_wait(struct __call_single_data *csd) { if (static_branch_unlikely(&csdlock_debug_enabled)) { __csd_lock_wait(csd); return; } smp_cond_load_acquire(&csd->node.u_flags, !(VAL & CSD_FLAG_LOCK)); } static void __smp_call_single_queue_debug(int cpu, struct llist_node *node) { unsigned int this_cpu = smp_processor_id(); struct cfd_seq_local *seq = this_cpu_ptr(&cfd_seq_local); struct call_function_data *cfd = this_cpu_ptr(&cfd_data); struct cfd_percpu *pcpu = per_cpu_ptr(cfd->pcpu, cpu); cfd_seq_store(pcpu->seq_queue, this_cpu, cpu, CFD_SEQ_QUEUE); if (llist_add(node, &per_cpu(call_single_queue, cpu))) { cfd_seq_store(pcpu->seq_ipi, this_cpu, cpu, CFD_SEQ_IPI); cfd_seq_store(seq->ping, this_cpu, cpu, CFD_SEQ_PING); send_call_function_single_ipi(cpu); cfd_seq_store(seq->pinged, this_cpu, cpu, CFD_SEQ_PINGED); } else { cfd_seq_store(pcpu->seq_noipi, this_cpu, cpu, CFD_SEQ_NOIPI); } } #else #define cfd_seq_store(var, src, dst, type) static void csd_lock_record(struct __call_single_data *csd) { } static __always_inline void csd_lock_wait(struct __call_single_data *csd) { smp_cond_load_acquire(&csd->node.u_flags, !(VAL & CSD_FLAG_LOCK)); } #endif static __always_inline void csd_lock(struct __call_single_data *csd) { csd_lock_wait(csd); csd->node.u_flags |= CSD_FLAG_LOCK; /* * prevent CPU from reordering the above assignment * to ->flags with any subsequent assignments to other * fields of the specified call_single_data_t structure: */ smp_wmb(); } static __always_inline void csd_unlock(struct __call_single_data *csd) { WARN_ON(!(csd->node.u_flags & CSD_FLAG_LOCK)); /* * ensure we're all done before releasing data: */ smp_store_release(&csd->node.u_flags, 0); } static DEFINE_PER_CPU_SHARED_ALIGNED(call_single_data_t, csd_data); void __smp_call_single_queue(int cpu, struct llist_node *node) { #ifdef CONFIG_CSD_LOCK_WAIT_DEBUG if (static_branch_unlikely(&csdlock_debug_extended)) { unsigned int type; type = CSD_TYPE(container_of(node, call_single_data_t, node.llist)); if (type == CSD_TYPE_SYNC || type == CSD_TYPE_ASYNC) { __smp_call_single_queue_debug(cpu, node); return; } } #endif /* * The list addition should be visible before sending the IPI * handler locks the list to pull the entry off it because of * normal cache coherency rules implied by spinlocks. * * If IPIs can go out of order to the cache coherency protocol * in an architecture, sufficient synchronisation should be added * to arch code to make it appear to obey cache coherency WRT * locking and barrier primitives. Generic code isn't really * equipped to do the right thing... */ if (llist_add(node, &per_cpu(call_single_queue, cpu))) send_call_function_single_ipi(cpu); } /* * Insert a previously allocated call_single_data_t element * for execution on the given CPU. data must already have * ->func, ->info, and ->flags set. */ static int generic_exec_single(int cpu, struct __call_single_data *csd) { if (cpu == smp_processor_id()) { smp_call_func_t func = csd->func; void *info = csd->info; unsigned long flags; /* * We can unlock early even for the synchronous on-stack case, * since we're doing this from the same CPU.. 
*/ csd_lock_record(csd); csd_unlock(csd); local_irq_save(flags); func(info); csd_lock_record(NULL); local_irq_restore(flags); return 0; } if ((unsigned)cpu >= nr_cpu_ids || !cpu_online(cpu)) { csd_unlock(csd); return -ENXIO; } __smp_call_single_queue(cpu, &csd->node.llist); return 0; } /** * generic_smp_call_function_single_interrupt - Execute SMP IPI callbacks * * Invoked by arch to handle an IPI for call function single. * Must be called with interrupts disabled. */ void generic_smp_call_function_single_interrupt(void) { cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->gotipi, CFD_SEQ_NOCPU, smp_processor_id(), CFD_SEQ_GOTIPI); flush_smp_call_function_queue(true); } /** * flush_smp_call_function_queue - Flush pending smp-call-function callbacks * * @warn_cpu_offline: If set to 'true', warn if callbacks were queued on an * offline CPU. Skip this check if set to 'false'. * * Flush any pending smp-call-function callbacks queued on this CPU. This is * invoked by the generic IPI handler, as well as by a CPU about to go offline, * to ensure that all pending IPI callbacks are run before it goes completely * offline. * * Loop through the call_single_queue and run all the queued callbacks. * Must be called with interrupts disabled. */ static void flush_smp_call_function_queue(bool warn_cpu_offline) { call_single_data_t *csd, *csd_next; struct llist_node *entry, *prev; struct llist_head *head; static bool warned; lockdep_assert_irqs_disabled(); head = this_cpu_ptr(&call_single_queue); cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->handle, CFD_SEQ_NOCPU, smp_processor_id(), CFD_SEQ_HANDLE); entry = llist_del_all(head); cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->dequeue, /* Special meaning of source cpu: 0 == queue empty */ entry ? CFD_SEQ_NOCPU : 0, smp_processor_id(), CFD_SEQ_DEQUEUE); entry = llist_reverse_order(entry); /* There shouldn't be any pending callbacks on an offline CPU. */ if (unlikely(warn_cpu_offline && !cpu_online(smp_processor_id()) && !warned && entry != NULL)) { warned = true; WARN(1, "IPI on offline CPU %d\n", smp_processor_id()); /* * We don't have to use the _safe() variant here * because we are not invoking the IPI handlers yet. */ llist_for_each_entry(csd, entry, node.llist) { switch (CSD_TYPE(csd)) { case CSD_TYPE_ASYNC: case CSD_TYPE_SYNC: case CSD_TYPE_IRQ_WORK: pr_warn("IPI callback %pS sent to offline CPU\n", csd->func); break; case CSD_TYPE_TTWU: pr_warn("IPI task-wakeup sent to offline CPU\n"); break; default: pr_warn("IPI callback, unknown type %d, sent to offline CPU\n", CSD_TYPE(csd)); break; } } } /* * First; run all SYNC callbacks, people are waiting for us. */ prev = NULL; llist_for_each_entry_safe(csd, csd_next, entry, node.llist) { /* Do we wait until *after* callback? */ if (CSD_TYPE(csd) == CSD_TYPE_SYNC) { smp_call_func_t func = csd->func; void *info = csd->info; if (prev) { prev->next = &csd_next->node.llist; } else { entry = &csd_next->node.llist; } csd_lock_record(csd); func(info); csd_unlock(csd); csd_lock_record(NULL); } else { prev = &csd->node.llist; } } if (!entry) { cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->hdlend, 0, smp_processor_id(), CFD_SEQ_HDLEND); return; } /* * Second; run all !SYNC callbacks. 
*/ prev = NULL; llist_for_each_entry_safe(csd, csd_next, entry, node.llist) { int type = CSD_TYPE(csd); if (type != CSD_TYPE_TTWU) { if (prev) { prev->next = &csd_next->node.llist; } else { entry = &csd_next->node.llist; } if (type == CSD_TYPE_ASYNC) { smp_call_func_t func = csd->func; void *info = csd->info; csd_lock_record(csd); csd_unlock(csd); func(info); csd_lock_record(NULL); } else if (type == CSD_TYPE_IRQ_WORK) { irq_work_single(csd); } } else { prev = &csd->node.llist; } } /* * Third; only CSD_TYPE_TTWU is left, issue those. */ if (entry) sched_ttwu_pending(entry); cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->hdlend, CFD_SEQ_NOCPU, smp_processor_id(), CFD_SEQ_HDLEND); } void flush_smp_call_function_from_idle(void) { unsigned long flags; if (llist_empty(this_cpu_ptr(&call_single_queue))) return; cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->idle, CFD_SEQ_NOCPU, smp_processor_id(), CFD_SEQ_IDLE); local_irq_save(flags); flush_smp_call_function_queue(true); if (local_softirq_pending()) do_softirq(); local_irq_restore(flags); } /* * smp_call_function_single - Run a function on a specific CPU * @func: The function to run. This must be fast and non-blocking. * @info: An arbitrary pointer to pass to the function. * @wait: If true, wait until function has completed on other CPUs. * * Returns 0 on success, else a negative status code. */ int smp_call_function_single(int cpu, smp_call_func_t func, void *info, int wait) { call_single_data_t *csd; call_single_data_t csd_stack = { .node = { .u_flags = CSD_FLAG_LOCK | CSD_TYPE_SYNC, }, }; int this_cpu; int err; /* * prevent preemption and reschedule on another processor, * as well as CPU removal */ this_cpu = get_cpu(); /* * Can deadlock when called with interrupts disabled. * We allow cpu's that are not yet online though, as no one else can * send smp call function interrupt to this cpu and as such deadlocks * can't happen. */ WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled() && !oops_in_progress); /* * When @wait we can deadlock when we interrupt between llist_add() and * arch_send_call_function_ipi*(); when !@wait we can deadlock due to * csd_lock() on because the interrupt context uses the same csd * storage. */ WARN_ON_ONCE(!in_task()); csd = &csd_stack; if (!wait) { csd = this_cpu_ptr(&csd_data); csd_lock(csd); } csd->func = func; csd->info = info; #ifdef CONFIG_CSD_LOCK_WAIT_DEBUG csd->node.src = smp_processor_id(); csd->node.dst = cpu; #endif err = generic_exec_single(cpu, csd); if (wait) csd_lock_wait(csd); put_cpu(); return err; } EXPORT_SYMBOL(smp_call_function_single); /** * smp_call_function_single_async() - Run an asynchronous function on a * specific CPU. * @cpu: The CPU to run on. * @csd: Pre-allocated and setup data structure * * Like smp_call_function_single(), but the call is asynchonous and * can thus be done from contexts with disabled interrupts. * * The caller passes his own pre-allocated data structure * (ie: embedded in an object) and is responsible for synchronizing it * such that the IPIs performed on the @csd are strictly serialized. * * If the function is called with one csd which has not yet been * processed by previous call to smp_call_function_single_async(), the * function will return immediately with -EBUSY showing that the csd * object is still in progress. * * NOTE: Be careful, there is unfortunately no current debugging facility to * validate the correctness of this serialization. 
* * Return: %0 on success or negative errno value on error */ int smp_call_function_single_async(int cpu, struct __call_single_data *csd) { int err = 0; preempt_disable(); if (csd->node.u_flags & CSD_FLAG_LOCK) { err = -EBUSY; goto out; } csd->node.u_flags = CSD_FLAG_LOCK; smp_wmb(); err = generic_exec_single(cpu, csd); out: preempt_enable(); return err; } EXPORT_SYMBOL_GPL(smp_call_function_single_async); /* * smp_call_function_any - Run a function on any of the given cpus * @mask: The mask of cpus it can run on. * @func: The function to run. This must be fast and non-blocking. * @info: An arbitrary pointer to pass to the function. * @wait: If true, wait until function has completed. * * Returns 0 on success, else a negative status code (if no cpus were online). * * Selection preference: * 1) current cpu if in @mask * 2) any cpu of current node if in @mask * 3) any other online cpu in @mask */ int smp_call_function_any(const struct cpumask *mask, smp_call_func_t func, void *info, int wait) { unsigned int cpu; const struct cpumask *nodemask; int ret; /* Try for same CPU (cheapest) */ cpu = get_cpu(); if (cpumask_test_cpu(cpu, mask)) goto call; /* Try for same node. */ nodemask = cpumask_of_node(cpu_to_node(cpu)); for (cpu = cpumask_first_and(nodemask, mask); cpu < nr_cpu_ids; cpu = cpumask_next_and(cpu, nodemask, mask)) { if (cpu_online(cpu)) goto call; } /* Any online will do: smp_call_function_single handles nr_cpu_ids. */ cpu = cpumask_any_and(mask, cpu_online_mask); call: ret = smp_call_function_single(cpu, func, info, wait); put_cpu(); return ret; } EXPORT_SYMBOL_GPL(smp_call_function_any); /* * Flags to be used as scf_flags argument of smp_call_function_many_cond(). * * %SCF_WAIT: Wait until function execution is completed * %SCF_RUN_LOCAL: Run also locally if local cpu is set in cpumask */ #define SCF_WAIT (1U << 0) #define SCF_RUN_LOCAL (1U << 1) static void smp_call_function_many_cond(const struct cpumask *mask, smp_call_func_t func, void *info, unsigned int scf_flags, smp_cond_func_t cond_func) { int cpu, last_cpu, this_cpu = smp_processor_id(); struct call_function_data *cfd; bool wait = scf_flags & SCF_WAIT; bool run_remote = false; bool run_local = false; int nr_cpus = 0; lockdep_assert_preemption_disabled(); /* * Can deadlock when called with interrupts disabled. * We allow cpu's that are not yet online though, as no one else can * send smp call function interrupt to this cpu and as such deadlocks * can't happen. */ if (cpu_online(this_cpu) && !oops_in_progress && !early_boot_irqs_disabled) lockdep_assert_irqs_enabled(); /* * When @wait we can deadlock when we interrupt between llist_add() and * arch_send_call_function_ipi*(); when !@wait we can deadlock due to * csd_lock() on because the interrupt context uses the same csd * storage. */ WARN_ON_ONCE(!in_task()); /* Check if we need local execution. */ if ((scf_flags & SCF_RUN_LOCAL) && cpumask_test_cpu(this_cpu, mask)) run_local = true; /* Check if we need remote execution, i.e., any CPU excluding this one. 
*/ cpu = cpumask_first_and(mask, cpu_online_mask); if (cpu == this_cpu) cpu = cpumask_next_and(cpu, mask, cpu_online_mask); if (cpu < nr_cpu_ids) run_remote = true; if (run_remote) { cfd = this_cpu_ptr(&cfd_data); cpumask_and(cfd->cpumask, mask, cpu_online_mask); __cpumask_clear_cpu(this_cpu, cfd->cpumask); cpumask_clear(cfd->cpumask_ipi); for_each_cpu(cpu, cfd->cpumask) { struct cfd_percpu *pcpu = per_cpu_ptr(cfd->pcpu, cpu); call_single_data_t *csd = &pcpu->csd; if (cond_func && !cond_func(cpu, info)) continue; csd_lock(csd); if (wait) csd->node.u_flags |= CSD_TYPE_SYNC; csd->func = func; csd->info = info; #ifdef CONFIG_CSD_LOCK_WAIT_DEBUG csd->node.src = smp_processor_id(); csd->node.dst = cpu; #endif cfd_seq_store(pcpu->seq_queue, this_cpu, cpu, CFD_SEQ_QUEUE); if (llist_add(&csd->node.llist, &per_cpu(call_single_queue, cpu))) { __cpumask_set_cpu(cpu, cfd->cpumask_ipi); nr_cpus++; last_cpu = cpu; cfd_seq_store(pcpu->seq_ipi, this_cpu, cpu, CFD_SEQ_IPI); } else { cfd_seq_store(pcpu->seq_noipi, this_cpu, cpu, CFD_SEQ_NOIPI); } } cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->ping, this_cpu, CFD_SEQ_NOCPU, CFD_SEQ_PING); /* * Choose the most efficient way to send an IPI. Note that the * number of CPUs might be zero due to concurrent changes to the * provided mask. */ if (nr_cpus == 1) send_call_function_single_ipi(last_cpu); else if (likely(nr_cpus > 1)) arch_send_call_function_ipi_mask(cfd->cpumask_ipi); cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->pinged, this_cpu, CFD_SEQ_NOCPU, CFD_SEQ_PINGED); } if (run_local && (!cond_func || cond_func(this_cpu, info))) { unsigned long flags; local_irq_save(flags); func(info); local_irq_restore(flags); } if (run_remote && wait) { for_each_cpu(cpu, cfd->cpumask) { call_single_data_t *csd; csd = &per_cpu_ptr(cfd->pcpu, cpu)->csd; csd_lock_wait(csd); } } } /** * smp_call_function_many(): Run a function on a set of CPUs. * @mask: The set of cpus to run on (only runs on online subset). * @func: The function to run. This must be fast and non-blocking. * @info: An arbitrary pointer to pass to the function. * @wait: Bitmask that controls the operation. If %SCF_WAIT is set, wait * (atomically) until function has completed on other CPUs. If * %SCF_RUN_LOCAL is set, the function will also be run locally * if the local CPU is set in the @cpumask. * * If @wait is true, then returns once @func has returned. * * You must not call this function with disabled interrupts or from a * hardware interrupt handler or from a bottom half handler. Preemption * must be disabled when calling this function. */ void smp_call_function_many(const struct cpumask *mask, smp_call_func_t func, void *info, bool wait) { smp_call_function_many_cond(mask, func, info, wait * SCF_WAIT, NULL); } EXPORT_SYMBOL(smp_call_function_many); /** * smp_call_function(): Run a function on all other CPUs. * @func: The function to run. This must be fast and non-blocking. * @info: An arbitrary pointer to pass to the function. * @wait: If true, wait (atomically) until function has completed * on other CPUs. * * Returns 0. * * If @wait is true, then returns once @func has returned; otherwise * it returns just before the target cpu calls @func. * * You must not call this function with disabled interrupts or from a * hardware interrupt handler or from a bottom half handler. 
*/ void smp_call_function(smp_call_func_t func, void *info, int wait) { preempt_disable(); smp_call_function_many(cpu_online_mask, func, info, wait); preempt_enable(); } EXPORT_SYMBOL(smp_call_function); /* Setup configured maximum number of CPUs to activate */ unsigned int setup_max_cpus = NR_CPUS; EXPORT_SYMBOL(setup_max_cpus); /* * Setup routine for controlling SMP activation * * Command-line option of "nosmp" or "maxcpus=0" will disable SMP * activation entirely (the MPS table probe still happens, though). * * Command-line option of "maxcpus=<NUM>", where <NUM> is an integer * greater than 0, limits the maximum number of CPUs activated in * SMP mode to <NUM>. */ void __weak arch_disable_smp_support(void) { } static int __init nosmp(char *str) { setup_max_cpus = 0; arch_disable_smp_support(); return 0; } early_param("nosmp", nosmp); /* this is the hard limit */ static int __init nrcpus(char *str) { int nr_cpus; if (get_option(&str, &nr_cpus) && nr_cpus > 0 && nr_cpus < nr_cpu_ids) nr_cpu_ids = nr_cpus; return 0; } early_param("nr_cpus", nrcpus); static int __init maxcpus(char *str) { get_option(&str, &setup_max_cpus); if (setup_max_cpus == 0) arch_disable_smp_support(); return 0; } early_param("maxcpus", maxcpus); /* Setup number of possible processor ids */ unsigned int nr_cpu_ids __read_mostly = NR_CPUS; EXPORT_SYMBOL(nr_cpu_ids); /* An arch may set nr_cpu_ids earlier if needed, so this would be redundant */ void __init setup_nr_cpu_ids(void) { nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask), NR_CPUS) + 1; } /* Called by boot processor to activate the rest. */ void __init smp_init(void) { int num_nodes, num_cpus; idle_threads_init(); cpuhp_threads_init(); pr_info("Bringing up secondary CPUs ...\n"); bringup_nonboot_cpus(setup_max_cpus); num_nodes = num_online_nodes(); num_cpus = num_online_cpus(); pr_info("Brought up %d node%s, %d CPU%s\n", num_nodes, (num_nodes > 1 ? "s" : ""), num_cpus, (num_cpus > 1 ? "s" : "")); /* Any cleanup work */ smp_cpus_done(setup_max_cpus); } /* * on_each_cpu_cond(): Call a function on each processor for which * the supplied function cond_func returns true, optionally waiting * for all the required CPUs to finish. This may include the local * processor. * @cond_func: A callback function that is passed a cpu id and * the info parameter. The function is called * with preemption disabled. The function should * return a boolean value indicating whether to IPI * the specified CPU. * @func: The function to run on all applicable CPUs. * This must be fast and non-blocking. * @info: An arbitrary pointer to pass to both functions. * @wait: If true, wait (atomically) until function has * completed on other CPUs. * * Preemption is disabled to protect against CPUs going offline but not online. * CPUs going online during the call will not be seen or sent an IPI. * * You must not call this function with disabled interrupts or * from a hardware interrupt handler or from a bottom half handler. */ void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func, void *info, bool wait, const struct cpumask *mask) { unsigned int scf_flags = SCF_RUN_LOCAL; if (wait) scf_flags |= SCF_WAIT; preempt_disable(); smp_call_function_many_cond(mask, func, info, scf_flags, cond_func); preempt_enable(); } EXPORT_SYMBOL(on_each_cpu_cond_mask); static void do_nothing(void *unused) { } /** * kick_all_cpus_sync - Force all cpus out of idle * * Used to synchronize the update of pm_idle function pointer.
It's * called after the pointer is updated and returns after the dummy * callback function has been executed on all cpus. The execution of * the function can only happen on the remote cpus after they have * left the idle function which had been called via pm_idle function * pointer. So it's guaranteed that nothing uses the previous pointer * anymore. */ void kick_all_cpus_sync(void) { /* Make sure the change is visible before we kick the cpus */ smp_mb(); smp_call_function(do_nothing, NULL, 1); } EXPORT_SYMBOL_GPL(kick_all_cpus_sync); /** * wake_up_all_idle_cpus - break all cpus out of idle * * wake_up_all_idle_cpus() tries to break all cpus that are in an idle state, * including idle-polling cpus, out of idle; nothing is done for cpus that * are not idle. */ void wake_up_all_idle_cpus(void) { int cpu; preempt_disable(); for_each_online_cpu(cpu) { if (cpu == smp_processor_id()) continue; wake_up_if_idle(cpu); } preempt_enable(); } EXPORT_SYMBOL_GPL(wake_up_all_idle_cpus); /** * struct smp_call_on_cpu_struct - Call a function on a specific CPU * @work: &work_struct * @done: &completion to signal * @func: function to call * @data: function's data argument * @ret: return value from @func * @cpu: target CPU (%-1 for any CPU) * * Used to call a function on a specific cpu and wait for it to return. * Optionally make sure the call is done on a specified physical cpu via vcpu * pinning in order to support virtualized environments. */ struct smp_call_on_cpu_struct { struct work_struct work; struct completion done; int (*func)(void *); void *data; int ret; int cpu; }; static void smp_call_on_cpu_callback(struct work_struct *work) { struct smp_call_on_cpu_struct *sscs; sscs = container_of(work, struct smp_call_on_cpu_struct, work); if (sscs->cpu >= 0) hypervisor_pin_vcpu(sscs->cpu); sscs->ret = sscs->func(sscs->data); if (sscs->cpu >= 0) hypervisor_pin_vcpu(-1); complete(&sscs->done); } int smp_call_on_cpu(unsigned int cpu, int (*func)(void *), void *par, bool phys) { struct smp_call_on_cpu_struct sscs = { .done = COMPLETION_INITIALIZER_ONSTACK(sscs.done), .func = func, .data = par, .cpu = phys ? cpu : -1, }; INIT_WORK_ONSTACK(&sscs.work, smp_call_on_cpu_callback); if (cpu >= nr_cpu_ids || !cpu_online(cpu)) return -ENXIO; queue_work_on(cpu, system_wq, &sscs.work); wait_for_completion(&sscs.done); destroy_work_on_stack(&sscs.work); return sscs.ret; } EXPORT_SYMBOL_GPL(smp_call_on_cpu);
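/*
 * Editor's illustrative sketch (not part of kernel/smp.c): a minimal example
 * of how a caller might use the cross-call APIs defined above. It assumes
 * task context and the usual <linux/smp.h>/<linux/atomic.h> declarations;
 * the example_* names are hypothetical.
 */
static atomic_t example_hits = ATOMIC_INIT(0);

static void example_ipi_func(void *info)
{
	atomic_t *hits = info;

	/* Runs on the target CPU, in IRQ context with interrupts disabled. */
	atomic_inc(hits);
}

static bool example_cond(int cpu, void *info)
{
	/* Only IPI even-numbered CPUs in this sketch. */
	return (cpu & 1) == 0;
}

static void example_smp_calls(void)
{
	/* Run on CPU 1 and wait; fails with -ENXIO if CPU 1 is offline. */
	if (smp_call_function_single(1, example_ipi_func, &example_hits, 1))
		pr_warn("example: CPU 1 not available\n");

	/* Run on every online even-numbered CPU, including this one, and wait. */
	on_each_cpu_cond_mask(example_cond, example_ipi_func, &example_hits,
			      true, cpu_online_mask);
}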
// SPDX-License-Identifier: GPL-2.0-only /* * User address space access functions. * * Copyright 1997 Andi Kleen <ak@muc.de> * Copyright 1997 Linus Torvalds * Copyright 2002 Andi Kleen <ak@suse.de> */ #include <linux/export.h> #include <linux/uaccess.h> #include <linux/highmem.h> /* * Zero Userspace */ unsigned long __clear_user(void __user *addr, unsigned long size) { long __d0; might_fault(); /* no memory constraint because it doesn't change any memory gcc knows about */ stac(); asm volatile( " testq %[size8],%[size8]\n" " jz 4f\n" " .align 16\n" "0: movq $0,(%[dst])\n" " addq $8,%[dst]\n" " decl %%ecx ; jnz 0b\n" "4: movq %[size1],%%rcx\n" " testl %%ecx,%%ecx\n" " jz 2f\n" "1: movb $0,(%[dst])\n" " incq %[dst]\n" " decl %%ecx ; jnz 1b\n" "2:\n" ".section .fixup,\"ax\"\n" "3: lea 0(%[size1],%[size8],8),%[size8]\n" " jmp 2b\n" ".previous\n" _ASM_EXTABLE_UA(0b, 3b) _ASM_EXTABLE_UA(1b, 2b) : [size8] "=&c"(size), [dst] "=&D" (__d0) : [size1] "r"(size & 7), "[size8]" (size / 8), "[dst]"(addr)); clac(); return size; } EXPORT_SYMBOL(__clear_user); unsigned long clear_user(void __user *to, unsigned long n) { if (access_ok(to, n)) return __clear_user(to, n); return n; } EXPORT_SYMBOL(clear_user); #ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE /** * clean_cache_range - write back a cache range with CLWB * @addr: virtual start address * @size: number of bytes to write back * * Write back a cache range using the CLWB (cache line write back) * instruction. Note that @size is internally rounded up to be cache * line size aligned. */ static void clean_cache_range(void *addr, size_t size) { u16 x86_clflush_size = boot_cpu_data.x86_clflush_size; unsigned long clflush_mask = x86_clflush_size - 1; void *vend = addr + size; void *p; for (p = (void *)((unsigned long)addr & ~clflush_mask); p < vend; p += x86_clflush_size) clwb(p); } void arch_wb_cache_pmem(void *addr, size_t size) { clean_cache_range(addr, size); } EXPORT_SYMBOL_GPL(arch_wb_cache_pmem); long __copy_user_flushcache(void *dst, const void __user *src, unsigned size) { unsigned long flushed, dest = (unsigned long) dst; long rc = __copy_user_nocache(dst, src, size, 0); /* * __copy_user_nocache() uses non-temporal stores for the bulk * of the transfer, but we need to manually flush if the * transfer is unaligned. A cached memory copy is used when * destination or size is not naturally aligned. That is: * - Require 8-byte alignment when size is 8 bytes or larger. * - Require 4-byte alignment when size is 4 bytes.
*/ if (size < 8) { if (!IS_ALIGNED(dest, 4) || size != 4) clean_cache_range(dst, size); } else { if (!IS_ALIGNED(dest, 8)) { dest = ALIGN(dest, boot_cpu_data.x86_clflush_size); clean_cache_range(dst, 1); } flushed = dest - (unsigned long) dst; if (size > flushed && !IS_ALIGNED(size - flushed, 8)) clean_cache_range(dst + size - 1, 1); } return rc; } void __memcpy_flushcache(void *_dst, const void *_src, size_t size) { unsigned long dest = (unsigned long) _dst; unsigned long source = (unsigned long) _src; /* cache copy and flush to align dest */ if (!IS_ALIGNED(dest, 8)) { size_t len = min_t(size_t, size, ALIGN(dest, 8) - dest); memcpy((void *) dest, (void *) source, len); clean_cache_range((void *) dest, len); dest += len; source += len; size -= len; if (!size) return; } /* 4x8 movnti loop */ while (size >= 32) { asm("movq (%0), %%r8\n" "movq 8(%0), %%r9\n" "movq 16(%0), %%r10\n" "movq 24(%0), %%r11\n" "movnti %%r8, (%1)\n" "movnti %%r9, 8(%1)\n" "movnti %%r10, 16(%1)\n" "movnti %%r11, 24(%1)\n" :: "r" (source), "r" (dest) : "memory", "r8", "r9", "r10", "r11"); dest += 32; source += 32; size -= 32; } /* 1x8 movnti loop */ while (size >= 8) { asm("movq (%0), %%r8\n" "movnti %%r8, (%1)\n" :: "r" (source), "r" (dest) : "memory", "r8"); dest += 8; source += 8; size -= 8; } /* 1x4 movnti loop */ while (size >= 4) { asm("movl (%0), %%r8d\n" "movnti %%r8d, (%1)\n" :: "r" (source), "r" (dest) : "memory", "r8"); dest += 4; source += 4; size -= 4; } /* cache copy for remaining bytes */ if (size) { memcpy((void *) dest, (void *) source, size); clean_cache_range((void *) dest, size); } } EXPORT_SYMBOL_GPL(__memcpy_flushcache); void memcpy_page_flushcache(char *to, struct page *page, size_t offset, size_t len) { char *from = kmap_atomic(page); memcpy_flushcache(to, from + offset, len); kunmap_atomic(from); } #endif
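/*
 * Editor's illustrative sketch (not part of arch/x86/lib/usercopy_64.c):
 * one way a character-device ->read() handler might use clear_user() from
 * above to zero-fill the tail of a user buffer when the backing data is
 * shorter than the requested count. example_read() and example_data are
 * hypothetical; the usual <linux/fs.h> and <linux/minmax.h> declarations
 * are assumed, and *ppos handling is omitted for brevity.
 */
static ssize_t example_read(struct file *file, char __user *buf,
			    size_t count, loff_t *ppos)
{
	static const char example_data[] = "example payload";
	size_t avail = sizeof(example_data);
	size_t n = min(count, avail);

	if (copy_to_user(buf, example_data, n))
		return -EFAULT;

	/* Pad the remainder of the user buffer with zeroes. */
	if (count > n && clear_user(buf + n, count - n))
		return -EFAULT;

	return count;
}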
// SPDX-License-Identifier: GPL-2.0 /* * bus.c - bus driver management * * Copyright (c) 2002-3 Patrick Mochel * Copyright (c) 2002-3 Open Source Development Labs * Copyright (c) 2007 Greg Kroah-Hartman <gregkh@suse.de> * Copyright (c) 2007 Novell Inc. */ #include <linux/async.h> #include <linux/device/bus.h> #include <linux/device.h> #include <linux/module.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/string.h> #include <linux/mutex.h> #include <linux/sysfs.h> #include "base.h" #include "power/power.h" /* /sys/devices/system */ static struct kset *system_kset; #define to_bus_attr(_attr) container_of(_attr, struct bus_attribute, attr) /* * sysfs bindings for drivers */ #define to_drv_attr(_attr) container_of(_attr, struct driver_attribute, attr) #define DRIVER_ATTR_IGNORE_LOCKDEP(_name, _mode, _show, _store) \ struct driver_attribute driver_attr_##_name = \ __ATTR_IGNORE_LOCKDEP(_name, _mode, _show, _store) static int __must_check bus_rescan_devices_helper(struct device *dev, void *data); static struct bus_type *bus_get(struct bus_type *bus) { if (bus) { kset_get(&bus->p->subsys); return bus; } return NULL; } static void bus_put(struct bus_type *bus) { if (bus) kset_put(&bus->p->subsys); } static ssize_t drv_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) { struct driver_attribute *drv_attr = to_drv_attr(attr); struct driver_private *drv_priv = to_driver(kobj); ssize_t ret = -EIO; if (drv_attr->show) ret = drv_attr->show(drv_priv->driver, buf); return ret; } static ssize_t drv_attr_store(struct kobject *kobj, struct attribute *attr, const char *buf, size_t count) { struct driver_attribute *drv_attr = to_drv_attr(attr); struct driver_private *drv_priv = to_driver(kobj); ssize_t ret = -EIO; if (drv_attr->store) ret = drv_attr->store(drv_priv->driver, buf, count); return ret; } static const struct sysfs_ops driver_sysfs_ops = { .show = drv_attr_show, .store = drv_attr_store, }; static void driver_release(struct kobject *kobj) { struct driver_private *drv_priv = to_driver(kobj); pr_debug("driver: '%s': %s\n", kobject_name(kobj), __func__); kfree(drv_priv); } static struct kobj_type driver_ktype = { .sysfs_ops = &driver_sysfs_ops, .release = driver_release, }; /* *
sysfs bindings for buses */ static ssize_t bus_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) { struct bus_attribute *bus_attr = to_bus_attr(attr); struct subsys_private *subsys_priv = to_subsys_private(kobj); /* return -EIO for reading a bus attribute without show() */ ssize_t ret = -EIO; if (bus_attr->show) ret = bus_attr->show(subsys_priv->bus, buf); return ret; } static ssize_t bus_attr_store(struct kobject *kobj, struct attribute *attr, const char *buf, size_t count) { struct bus_attribute *bus_attr = to_bus_attr(attr); struct subsys_private *subsys_priv = to_subsys_private(kobj); /* return -EIO for writing a bus attribute without store() */ ssize_t ret = -EIO; if (bus_attr->store) ret = bus_attr->store(subsys_priv->bus, buf, count); return ret; } static const struct sysfs_ops bus_sysfs_ops = { .show = bus_attr_show, .store = bus_attr_store, }; int bus_create_file(struct bus_type *bus, struct bus_attribute *attr) { int error; if (bus_get(bus)) { error = sysfs_create_file(&bus->p->subsys.kobj, &attr->attr); bus_put(bus); } else error = -EINVAL; return error; } EXPORT_SYMBOL_GPL(bus_create_file); void bus_remove_file(struct bus_type *bus, struct bus_attribute *attr) { if (bus_get(bus)) { sysfs_remove_file(&bus->p->subsys.kobj, &attr->attr); bus_put(bus); } } EXPORT_SYMBOL_GPL(bus_remove_file); static void bus_release(struct kobject *kobj) { struct subsys_private *priv = to_subsys_private(kobj); struct bus_type *bus = priv->bus; kfree(priv); bus->p = NULL; } static struct kobj_type bus_ktype = { .sysfs_ops = &bus_sysfs_ops, .release = bus_release, }; static int bus_uevent_filter(struct kset *kset, struct kobject *kobj) { struct kobj_type *ktype = get_ktype(kobj); if (ktype == &bus_ktype) return 1; return 0; } static const struct kset_uevent_ops bus_uevent_ops = { .filter = bus_uevent_filter, }; static struct kset *bus_kset; /* Manually detach a device from its associated driver. */ static ssize_t unbind_store(struct device_driver *drv, const char *buf, size_t count) { struct bus_type *bus = bus_get(drv->bus); struct device *dev; int err = -ENODEV; dev = bus_find_device_by_name(bus, NULL, buf); if (dev && dev->driver == drv) { device_driver_detach(dev); err = count; } put_device(dev); bus_put(bus); return err; } static DRIVER_ATTR_IGNORE_LOCKDEP(unbind, 0200, NULL, unbind_store); /* * Manually attach a device to a driver. * Note: the driver must want to bind to the device, * it is not possible to override the driver's id table. 
*/ static ssize_t bind_store(struct device_driver *drv, const char *buf, size_t count) { struct bus_type *bus = bus_get(drv->bus); struct device *dev; int err = -ENODEV; dev = bus_find_device_by_name(bus, NULL, buf); if (dev && driver_match_device(drv, dev)) { err = device_driver_attach(drv, dev); if (!err) { /* success */ err = count; } } put_device(dev); bus_put(bus); return err; } static DRIVER_ATTR_IGNORE_LOCKDEP(bind, 0200, NULL, bind_store); static ssize_t drivers_autoprobe_show(struct bus_type *bus, char *buf) { return sysfs_emit(buf, "%d\n", bus->p->drivers_autoprobe); } static ssize_t drivers_autoprobe_store(struct bus_type *bus, const char *buf, size_t count) { if (buf[0] == '0') bus->p->drivers_autoprobe = 0; else bus->p->drivers_autoprobe = 1; return count; } static ssize_t drivers_probe_store(struct bus_type *bus, const char *buf, size_t count) { struct device *dev; int err = -EINVAL; dev = bus_find_device_by_name(bus, NULL, buf); if (!dev) return -ENODEV; if (bus_rescan_devices_helper(dev, NULL) == 0) err = count; put_device(dev); return err; } static struct device *next_device(struct klist_iter *i) { struct klist_node *n = klist_next(i); struct device *dev = NULL; struct device_private *dev_prv; if (n) { dev_prv = to_device_private_bus(n); dev = dev_prv->device; } return dev; } /** * bus_for_each_dev - device iterator. * @bus: bus type. * @start: device to start iterating from. * @data: data for the callback. * @fn: function to be called for each device. * * Iterate over @bus's list of devices, and call @fn for each, * passing it @data. If @start is not NULL, we use that device to * begin iterating from. * * We check the return of @fn each time. If it returns anything * other than 0, we break out and return that value. * * NOTE: The device that returns a non-zero value is not retained * in any way, nor is its refcount incremented. If the caller needs * to retain this data, it should do so, and increment the reference * count in the supplied callback. */ int bus_for_each_dev(struct bus_type *bus, struct device *start, void *data, int (*fn)(struct device *, void *)) { struct klist_iter i; struct device *dev; int error = 0; if (!bus || !bus->p) return -EINVAL; klist_iter_init_node(&bus->p->klist_devices, &i, (start ? &start->p->knode_bus : NULL)); while (!error && (dev = next_device(&i))) error = fn(dev, data); klist_iter_exit(&i); return error; } EXPORT_SYMBOL_GPL(bus_for_each_dev); /** * bus_find_device - device iterator for locating a particular device. * @bus: bus type * @start: Device to begin with * @data: Data to pass to match function * @match: Callback function to check device * * This is similar to the bus_for_each_dev() function above, but it * returns a reference to a device that is 'found' for later use, as * determined by the @match callback. * * The callback should return 0 if the device doesn't match and non-zero * if it does. If the callback returns non-zero, this function will * return to the caller and not iterate over any more devices. */ struct device *bus_find_device(struct bus_type *bus, struct device *start, const void *data, int (*match)(struct device *dev, const void *data)) { struct klist_iter i; struct device *dev; if (!bus || !bus->p) return NULL; klist_iter_init_node(&bus->p->klist_devices, &i, (start ? 
&start->p->knode_bus : NULL)); while ((dev = next_device(&i))) if (match(dev, data) && get_device(dev)) break; klist_iter_exit(&i); return dev; } EXPORT_SYMBOL_GPL(bus_find_device); /** * subsys_find_device_by_id - find a device with a specific enumeration number * @subsys: subsystem * @id: index 'id' in struct device * @hint: device to check first * * Check the hint's next object and if it is a match return it directly, * otherwise, fall back to a full list search. Either way a reference for * the returned object is taken. */ struct device *subsys_find_device_by_id(struct bus_type *subsys, unsigned int id, struct device *hint) { struct klist_iter i; struct device *dev; if (!subsys) return NULL; if (hint) { klist_iter_init_node(&subsys->p->klist_devices, &i, &hint->p->knode_bus); dev = next_device(&i); if (dev && dev->id == id && get_device(dev)) { klist_iter_exit(&i); return dev; } klist_iter_exit(&i); } klist_iter_init_node(&subsys->p->klist_devices, &i, NULL); while ((dev = next_device(&i))) { if (dev->id == id && get_device(dev)) { klist_iter_exit(&i); return dev; } } klist_iter_exit(&i); return NULL; } EXPORT_SYMBOL_GPL(subsys_find_device_by_id); static struct device_driver *next_driver(struct klist_iter *i) { struct klist_node *n = klist_next(i); struct driver_private *drv_priv; if (n) { drv_priv = container_of(n, struct driver_private, knode_bus); return drv_priv->driver; } return NULL; } /** * bus_for_each_drv - driver iterator * @bus: bus we're dealing with. * @start: driver to start iterating on. * @data: data to pass to the callback. * @fn: function to call for each driver. * * This is nearly identical to the device iterator above. * We iterate over each driver that belongs to @bus, and call * @fn for each. If @fn returns anything but 0, we break out * and return it. If @start is not NULL, we use it as the head * of the list. * * NOTE: we don't return the driver that returns a non-zero * value, nor do we leave the reference count incremented for that * driver. If the caller needs to know that info, it must set it * in the callback. It must also be sure to increment the refcount * so it doesn't disappear before returning to the caller. */ int bus_for_each_drv(struct bus_type *bus, struct device_driver *start, void *data, int (*fn)(struct device_driver *, void *)) { struct klist_iter i; struct device_driver *drv; int error = 0; if (!bus) return -EINVAL; klist_iter_init_node(&bus->p->klist_drivers, &i, start ? &start->p->knode_bus : NULL); while ((drv = next_driver(&i)) && !error) error = fn(drv, data); klist_iter_exit(&i); return error; } EXPORT_SYMBOL_GPL(bus_for_each_drv); /** * bus_add_device - add device to bus * @dev: device being added * * - Add device's bus attributes. * - Create links to device's bus. * - Add the device to its bus's list of devices. 
*/ int bus_add_device(struct device *dev) { struct bus_type *bus = bus_get(dev->bus); int error = 0; if (bus) { pr_debug("bus: '%s': add device %s\n", bus->name, dev_name(dev)); error = device_add_groups(dev, bus->dev_groups); if (error) goto out_put; error = sysfs_create_link(&bus->p->devices_kset->kobj, &dev->kobj, dev_name(dev)); if (error) goto out_groups; error = sysfs_create_link(&dev->kobj, &dev->bus->p->subsys.kobj, "subsystem"); if (error) goto out_subsys; klist_add_tail(&dev->p->knode_bus, &bus->p->klist_devices); } return 0; out_subsys: sysfs_remove_link(&bus->p->devices_kset->kobj, dev_name(dev)); out_groups: device_remove_groups(dev, bus->dev_groups); out_put: bus_put(dev->bus); return error; } /** * bus_probe_device - probe drivers for a new device * @dev: device to probe * * - Automatically probe for a driver if the bus allows it. */ void bus_probe_device(struct device *dev) { struct bus_type *bus = dev->bus; struct subsys_interface *sif; if (!bus) return; if (bus->p->drivers_autoprobe) device_initial_probe(dev); mutex_lock(&bus->p->mutex); list_for_each_entry(sif, &bus->p->interfaces, node) if (sif->add_dev) sif->add_dev(dev, sif); mutex_unlock(&bus->p->mutex); } /** * bus_remove_device - remove device from bus * @dev: device to be removed * * - Remove device from all interfaces. * - Remove symlink from bus' directory. * - Delete device from bus's list. * - Detach from its driver. * - Drop reference taken in bus_add_device(). */ void bus_remove_device(struct device *dev) { struct bus_type *bus = dev->bus; struct subsys_interface *sif; if (!bus) return; mutex_lock(&bus->p->mutex); list_for_each_entry(sif, &bus->p->interfaces, node) if (sif->remove_dev) sif->remove_dev(dev, sif); mutex_unlock(&bus->p->mutex); sysfs_remove_link(&dev->kobj, "subsystem"); sysfs_remove_link(&dev->bus->p->devices_kset->kobj, dev_name(dev)); device_remove_groups(dev, dev->bus->dev_groups); if (klist_node_attached(&dev->p->knode_bus)) klist_del(&dev->p->knode_bus); pr_debug("bus: '%s': remove device %s\n", dev->bus->name, dev_name(dev)); device_release_driver(dev); bus_put(dev->bus); } static int __must_check add_bind_files(struct device_driver *drv) { int ret; ret = driver_create_file(drv, &driver_attr_unbind); if (ret == 0) { ret = driver_create_file(drv, &driver_attr_bind); if (ret) driver_remove_file(drv, &driver_attr_unbind); } return ret; } static void remove_bind_files(struct device_driver *drv) { driver_remove_file(drv, &driver_attr_bind); driver_remove_file(drv, &driver_attr_unbind); } static BUS_ATTR_WO(drivers_probe); static BUS_ATTR_RW(drivers_autoprobe); static int add_probe_files(struct bus_type *bus) { int retval; retval = bus_create_file(bus, &bus_attr_drivers_probe); if (retval) goto out; retval = bus_create_file(bus, &bus_attr_drivers_autoprobe); if (retval) bus_remove_file(bus, &bus_attr_drivers_probe); out: return retval; } static void remove_probe_files(struct bus_type *bus) { bus_remove_file(bus, &bus_attr_drivers_autoprobe); bus_remove_file(bus, &bus_attr_drivers_probe); } static ssize_t uevent_store(struct device_driver *drv, const char *buf, size_t count) { int rc; rc = kobject_synth_uevent(&drv->p->kobj, buf, count); return rc ? rc : count; } static DRIVER_ATTR_WO(uevent); /** * bus_add_driver - Add a driver to the bus. * @drv: driver. 
*/ int bus_add_driver(struct device_driver *drv) { struct bus_type *bus; struct driver_private *priv; int error = 0; bus = bus_get(drv->bus); if (!bus) return -EINVAL; pr_debug("bus: '%s': add driver %s\n", bus->name, drv->name); priv = kzalloc(sizeof(*priv), GFP_KERNEL); if (!priv) { error = -ENOMEM; goto out_put_bus; } klist_init(&priv->klist_devices, NULL, NULL); priv->driver = drv; drv->p = priv; priv->kobj.kset = bus->p->drivers_kset; error = kobject_init_and_add(&priv->kobj, &driver_ktype, NULL, "%s", drv->name); if (error) goto out_unregister; klist_add_tail(&priv->knode_bus, &bus->p->klist_drivers); if (drv->bus->p->drivers_autoprobe) { error = driver_attach(drv); if (error) goto out_del_list; } module_add_driver(drv->owner, drv); error = driver_create_file(drv, &driver_attr_uevent); if (error) { printk(KERN_ERR "%s: uevent attr (%s) failed\n", __func__, drv->name); } error = driver_add_groups(drv, bus->drv_groups); if (error) { /* How the hell do we get out of this pickle? Give up */ printk(KERN_ERR "%s: driver_add_groups(%s) failed\n", __func__, drv->name); } if (!drv->suppress_bind_attrs) { error = add_bind_files(drv); if (error) { /* Ditto */ printk(KERN_ERR "%s: add_bind_files(%s) failed\n", __func__, drv->name); } } return 0; out_del_list: klist_del(&priv->knode_bus); out_unregister: kobject_put(&priv->kobj); /* drv->p is freed in driver_release() */ drv->p = NULL; out_put_bus: bus_put(bus); return error; } /** * bus_remove_driver - delete driver from bus's knowledge. * @drv: driver. * * Detach the driver from the devices it controls, and remove * it from its bus's list of drivers. Finally, we drop the reference * to the bus we took in bus_add_driver(). */ void bus_remove_driver(struct device_driver *drv) { if (!drv->bus) return; if (!drv->suppress_bind_attrs) remove_bind_files(drv); driver_remove_groups(drv, drv->bus->drv_groups); driver_remove_file(drv, &driver_attr_uevent); klist_remove(&drv->p->knode_bus); pr_debug("bus: '%s': remove driver %s\n", drv->bus->name, drv->name); driver_detach(drv); module_remove_driver(drv); kobject_put(&drv->p->kobj); bus_put(drv->bus); } /* Helper for bus_rescan_devices's iter */ static int __must_check bus_rescan_devices_helper(struct device *dev, void *data) { int ret = 0; if (!dev->driver) { if (dev->parent && dev->bus->need_parent_lock) device_lock(dev->parent); ret = device_attach(dev); if (dev->parent && dev->bus->need_parent_lock) device_unlock(dev->parent); } return ret < 0 ? ret : 0; } /** * bus_rescan_devices - rescan devices on the bus for possible drivers * @bus: the bus to scan. * * This function will look for devices on the bus with no driver * attached and rescan them against existing drivers to see if they match * any, by calling device_attach() for the unbound devices. */ int bus_rescan_devices(struct bus_type *bus) { return bus_for_each_dev(bus, NULL, NULL, bus_rescan_devices_helper); } EXPORT_SYMBOL_GPL(bus_rescan_devices); /** * device_reprobe - remove driver for a device and probe for a new driver * @dev: the device to reprobe * * This function detaches the attached driver (if any) for the given * device and restarts the driver probing process. It is intended * to be used if probing criteria changed during a device's lifetime and * driver attachment should change accordingly.
*/ int device_reprobe(struct device *dev) { if (dev->driver) device_driver_detach(dev); return bus_rescan_devices_helper(dev, NULL); } EXPORT_SYMBOL_GPL(device_reprobe); static int bus_add_groups(struct bus_type *bus, const struct attribute_group **groups) { return sysfs_create_groups(&bus->p->subsys.kobj, groups); } static void bus_remove_groups(struct bus_type *bus, const struct attribute_group **groups) { sysfs_remove_groups(&bus->p->subsys.kobj, groups); } static void klist_devices_get(struct klist_node *n) { struct device_private *dev_prv = to_device_private_bus(n); struct device *dev = dev_prv->device; get_device(dev); } static void klist_devices_put(struct klist_node *n) { struct device_private *dev_prv = to_device_private_bus(n); struct device *dev = dev_prv->device; put_device(dev); } static ssize_t bus_uevent_store(struct bus_type *bus, const char *buf, size_t count) { int rc; rc = kobject_synth_uevent(&bus->p->subsys.kobj, buf, count); return rc ? rc : count; } /* * "open code" the old BUS_ATTR() macro here. We want to use BUS_ATTR_WO() * here, but can not use it as earlier in the file we have * DRIVER_ATTR_WO(uevent), which would cause a clash with the store * function name. */ static struct bus_attribute bus_attr_uevent = __ATTR(uevent, 0200, NULL, bus_uevent_store); /** * bus_register - register a driver-core subsystem * @bus: bus to register * * Once we have that, we register the bus with the kobject * infrastructure, then register the children subsystems it has: * the devices and drivers that belong to the subsystem. */ int bus_register(struct bus_type *bus) { int retval; struct subsys_private *priv; struct lock_class_key *key = &bus->lock_key; priv = kzalloc(sizeof(struct subsys_private), GFP_KERNEL); if (!priv) return -ENOMEM; priv->bus = bus; bus->p = priv; BLOCKING_INIT_NOTIFIER_HEAD(&priv->bus_notifier); retval = kobject_set_name(&priv->subsys.kobj, "%s", bus->name); if (retval) goto out; priv->subsys.kobj.kset = bus_kset; priv->subsys.kobj.ktype = &bus_ktype; priv->drivers_autoprobe = 1; retval = kset_register(&priv->subsys); if (retval) goto out; retval = bus_create_file(bus, &bus_attr_uevent); if (retval) goto bus_uevent_fail; priv->devices_kset = kset_create_and_add("devices", NULL, &priv->subsys.kobj); if (!priv->devices_kset) { retval = -ENOMEM; goto bus_devices_fail; } priv->drivers_kset = kset_create_and_add("drivers", NULL, &priv->subsys.kobj); if (!priv->drivers_kset) { retval = -ENOMEM; goto bus_drivers_fail; } INIT_LIST_HEAD(&priv->interfaces); __mutex_init(&priv->mutex, "subsys mutex", key); klist_init(&priv->klist_devices, klist_devices_get, klist_devices_put); klist_init(&priv->klist_drivers, NULL, NULL); retval = add_probe_files(bus); if (retval) goto bus_probe_files_fail; retval = bus_add_groups(bus, bus->bus_groups); if (retval) goto bus_groups_fail; pr_debug("bus: '%s': registered\n", bus->name); return 0; bus_groups_fail: remove_probe_files(bus); bus_probe_files_fail: kset_unregister(bus->p->drivers_kset); bus_drivers_fail: kset_unregister(bus->p->devices_kset); bus_devices_fail: bus_remove_file(bus, &bus_attr_uevent); bus_uevent_fail: kset_unregister(&bus->p->subsys); /* Above kset_unregister() will kfree @bus->p */ bus->p = NULL; out: kfree(bus->p); bus->p = NULL; return retval; } EXPORT_SYMBOL_GPL(bus_register); /** * bus_unregister - remove a bus from the system * @bus: bus. * * Unregister the child subsystems and the bus itself.
* Finally, we call bus_put() to release the refcount */ void bus_unregister(struct bus_type *bus) { pr_debug("bus: '%s': unregistering\n", bus->name); if (bus->dev_root) device_unregister(bus->dev_root); bus_remove_groups(bus, bus->bus_groups); remove_probe_files(bus); kset_unregister(bus->p->drivers_kset); kset_unregister(bus->p->devices_kset); bus_remove_file(bus, &bus_attr_uevent); kset_unregister(&bus->p->subsys); } EXPORT_SYMBOL_GPL(bus_unregister); int bus_register_notifier(struct bus_type *bus, struct notifier_block *nb) { return blocking_notifier_chain_register(&bus->p->bus_notifier, nb); } EXPORT_SYMBOL_GPL(bus_register_notifier); int bus_unregister_notifier(struct bus_type *bus, struct notifier_block *nb) { return blocking_notifier_chain_unregister(&bus->p->bus_notifier, nb); } EXPORT_SYMBOL_GPL(bus_unregister_notifier); struct kset *bus_get_kset(struct bus_type *bus) { return &bus->p->subsys; } EXPORT_SYMBOL_GPL(bus_get_kset); struct klist *bus_get_device_klist(struct bus_type *bus) { return &bus->p->klist_devices; } EXPORT_SYMBOL_GPL(bus_get_device_klist); /* * Yes, this forcibly breaks the klist abstraction temporarily. It * just wants to sort the klist, not change reference counts and * take/drop locks rapidly in the process. It does all this while * holding the lock for the list, so objects can't otherwise be * added/removed while we're swizzling. */ static void device_insertion_sort_klist(struct device *a, struct list_head *list, int (*compare)(const struct device *a, const struct device *b)) { struct klist_node *n; struct device_private *dev_prv; struct device *b; list_for_each_entry(n, list, n_node) { dev_prv = to_device_private_bus(n); b = dev_prv->device; if (compare(a, b) <= 0) { list_move_tail(&a->p->knode_bus.n_node, &b->p->knode_bus.n_node); return; } } list_move_tail(&a->p->knode_bus.n_node, list); } void bus_sort_breadthfirst(struct bus_type *bus, int (*compare)(const struct device *a, const struct device *b)) { LIST_HEAD(sorted_devices); struct klist_node *n, *tmp; struct device_private *dev_prv; struct device *dev; struct klist *device_klist; device_klist = bus_get_device_klist(bus); spin_lock(&device_klist->k_lock); list_for_each_entry_safe(n, tmp, &device_klist->k_list, n_node) { dev_prv = to_device_private_bus(n); dev = dev_prv->device; device_insertion_sort_klist(dev, &sorted_devices, compare); } list_splice(&sorted_devices, &device_klist->k_list); spin_unlock(&device_klist->k_lock); } EXPORT_SYMBOL_GPL(bus_sort_breadthfirst); /** * subsys_dev_iter_init - initialize subsys device iterator * @iter: subsys iterator to initialize * @subsys: the subsys we wanna iterate over * @start: the device to start iterating from, if any * @type: device_type of the devices to iterate over, NULL for all * * Initialize subsys iterator @iter such that it iterates over devices * of @subsys. If @start is set, the list iteration will start there, * otherwise if it is NULL, the iteration starts at the beginning of * the list. */ void subsys_dev_iter_init(struct subsys_dev_iter *iter, struct bus_type *subsys, struct device *start, const struct device_type *type) { struct klist_node *start_knode = NULL; if (start) start_knode = &start->p->knode_bus; klist_iter_init_node(&subsys->p->klist_devices, &iter->ki, start_knode); iter->type = type; } EXPORT_SYMBOL_GPL(subsys_dev_iter_init); /** * subsys_dev_iter_next - iterate to the next device * @iter: subsys iterator to proceed * * Proceed @iter to the next device and return it. Returns NULL if * iteration is complete. 
* * The returned device is referenced and won't be released until the * iterator is advanced to the next device or exited. The caller is * free to do whatever it wants to do with the device including * calling back into subsys code. */ struct device *subsys_dev_iter_next(struct subsys_dev_iter *iter) { struct klist_node *knode; struct device *dev; for (;;) { knode = klist_next(&iter->ki); if (!knode) return NULL; dev = to_device_private_bus(knode)->device; if (!iter->type || iter->type == dev->type) return dev; } } EXPORT_SYMBOL_GPL(subsys_dev_iter_next); /** * subsys_dev_iter_exit - finish iteration * @iter: subsys iterator to finish * * Finish an iteration. Always call this function after iteration is * complete whether the iteration ran till the end or not. */ void subsys_dev_iter_exit(struct subsys_dev_iter *iter) { klist_iter_exit(&iter->ki); } EXPORT_SYMBOL_GPL(subsys_dev_iter_exit); int subsys_interface_register(struct subsys_interface *sif) { struct bus_type *subsys; struct subsys_dev_iter iter; struct device *dev; if (!sif || !sif->subsys) return -ENODEV; subsys = bus_get(sif->subsys); if (!subsys) return -EINVAL; mutex_lock(&subsys->p->mutex); list_add_tail(&sif->node, &subsys->p->interfaces); if (sif->add_dev) { subsys_dev_iter_init(&iter, subsys, NULL, NULL); while ((dev = subsys_dev_iter_next(&iter))) sif->add_dev(dev, sif); subsys_dev_iter_exit(&iter); } mutex_unlock(&subsys->p->mutex); return 0; } EXPORT_SYMBOL_GPL(subsys_interface_register); void subsys_interface_unregister(struct subsys_interface *sif) { struct bus_type *subsys; struct subsys_dev_iter iter; struct device *dev; if (!sif || !sif->subsys) return; subsys = sif->subsys; mutex_lock(&subsys->p->mutex); list_del_init(&sif->node); if (sif->remove_dev) { subsys_dev_iter_init(&iter, subsys, NULL, NULL); while ((dev = subsys_dev_iter_next(&iter))) sif->remove_dev(dev, sif); subsys_dev_iter_exit(&iter); } mutex_unlock(&subsys->p->mutex); bus_put(subsys); } EXPORT_SYMBOL_GPL(subsys_interface_unregister); static void system_root_device_release(struct device *dev) { kfree(dev); } static int subsys_register(struct bus_type *subsys, const struct attribute_group **groups, struct kobject *parent_of_root) { struct device *dev; int err; err = bus_register(subsys); if (err < 0) return err; dev = kzalloc(sizeof(struct device), GFP_KERNEL); if (!dev) { err = -ENOMEM; goto err_dev; } err = dev_set_name(dev, "%s", subsys->name); if (err < 0) goto err_name; dev->kobj.parent = parent_of_root; dev->groups = groups; dev->release = system_root_device_release; err = device_register(dev); if (err < 0) goto err_dev_reg; subsys->dev_root = dev; return 0; err_dev_reg: put_device(dev); dev = NULL; err_name: kfree(dev); err_dev: bus_unregister(subsys); return err; } /** * subsys_system_register - register a subsystem at /sys/devices/system/ * @subsys: system subsystem * @groups: default attributes for the root device * * All 'system' subsystems have a /sys/devices/system/<name> root device * with the name of the subsystem. The root device can carry subsystem- * wide attributes. All registered devices are below this single root * device and are named after the subsystem with a simple enumeration * number appended. The registered devices are not explicitly named; * only 'id' in the device needs to be set. * * Do not use this interface for anything new, it exists for compatibility * with bad ideas only.
New subsystems should use plain subsystems, and * subsystem-wide attributes should be added to the subsystem directory * itself rather than to some fake root device placed in * /sys/devices/system/<name>. */ int subsys_system_register(struct bus_type *subsys, const struct attribute_group **groups) { return subsys_register(subsys, groups, &system_kset->kobj); } EXPORT_SYMBOL_GPL(subsys_system_register); /** * subsys_virtual_register - register a subsystem at /sys/devices/virtual/ * @subsys: virtual subsystem * @groups: default attributes for the root device * * All 'virtual' subsystems have a /sys/devices/virtual/<name> root device * with the name of the subsystem. The root device can carry subsystem-wide * attributes. All registered devices are below this single root device. * There's no restriction on device naming. This is for kernel software * constructs which need a sysfs interface. */ int subsys_virtual_register(struct bus_type *subsys, const struct attribute_group **groups) { struct kobject *virtual_dir; virtual_dir = virtual_device_parent(NULL); if (!virtual_dir) return -ENOMEM; return subsys_register(subsys, groups, virtual_dir); } EXPORT_SYMBOL_GPL(subsys_virtual_register); int __init buses_init(void) { bus_kset = kset_create_and_add("bus", &bus_uevent_ops, NULL); if (!bus_kset) return -ENOMEM; system_kset = kset_create_and_add("system", NULL, &devices_kset->kobj); if (!system_kset) return -ENOMEM; return 0; }
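/*
 * Editor's illustrative sketch (not part of drivers/base/bus.c): the minimal
 * shape of a client that registers its own bus with bus_register() and tears
 * it down with bus_unregister(). "example_bus" and example_match() are
 * hypothetical; a real bus would also provide probe/remove callbacks and a
 * proper ID-table based match.
 */
static int example_match(struct device *dev, struct device_driver *drv)
{
	/* Bind by name in this sketch; real buses match on device IDs. */
	return strcmp(dev_name(dev), drv->name) == 0;
}

static struct bus_type example_bus = {
	.name	= "example",
	.match	= example_match,
};

static int __init example_bus_init(void)
{
	/* Creates /sys/bus/example with its devices/ and drivers/ directories. */
	return bus_register(&example_bus);
}

static void __exit example_bus_exit(void)
{
	bus_unregister(&example_bus);
}

module_init(example_bus_init);
module_exit(example_bus_exit);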
// SPDX-License-Identifier: GPL-2.0-only /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system.
INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Implementation of the Transmission Control Protocol(TCP). * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Mark Evans, <evansmp@uhura.aston.ac.uk> * Corey Minyard <wf-rch!minyard@relay.EU.net> * Florian La Roche, <flla@stud.uni-sb.de> * Charles Hedrick, <hedrick@klinzhai.rutgers.edu> * Linus Torvalds, <torvalds@cs.helsinki.fi> * Alan Cox, <gw4pts@gw4pts.ampr.org> * Matthew Dillon, <dillon@apollo.west.oic.com> * Arnt Gulbrandsen, <agulbra@nvg.unit.no> * Jorge Cwik, <jorge@laser.satlink.net> */ #include <linux/mm.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/sysctl.h> #include <linux/workqueue.h> #include <linux/static_key.h> #include <net/tcp.h> #include <net/inet_common.h> #include <net/xfrm.h> #include <net/busy_poll.h> static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) { if (seq == s_win) return true; if (after(end_seq, s_win) && before(seq, e_win)) return true; return seq == e_win && seq == end_seq; } static enum tcp_tw_status tcp_timewait_check_oow_rate_limit(struct inet_timewait_sock *tw, const struct sk_buff *skb, int mib_idx) { struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); if (!tcp_oow_rate_limited(twsk_net(tw), skb, mib_idx, &tcptw->tw_last_oow_ack_time)) { /* Send ACK. Note, we do not put the bucket, * it will be released by caller. */ return TCP_TW_ACK; } /* We are rate-limiting, so just release the tw sock and drop skb. */ inet_twsk_put(tw); return TCP_TW_SUCCESS; } /* * * Main purpose of TIME-WAIT state is to close connection gracefully, * when one of ends sits in LAST-ACK or CLOSING retransmitting FIN * (and, probably, tail of data) and one or more our ACKs are lost. * * What is TIME-WAIT timeout? It is associated with maximal packet * lifetime in the internet, which results in wrong conclusion, that * it is set to catch "old duplicate segments" wandering out of their path. * It is not quite correct. This timeout is calculated so that it exceeds * maximal retransmission timeout enough to allow to lose one (or more) * segments sent by peer and our ACKs. This time may be calculated from RTO. * * When TIME-WAIT socket receives RST, it means that another end * finally closed and we are allowed to kill TIME-WAIT too. * * Second purpose of TIME-WAIT is catching old duplicate segments. * Well, certainly it is pure paranoia, but if we load TIME-WAIT * with this semantics, we MUST NOT kill TIME-WAIT state with RSTs. * * If we invented some more clever way to catch duplicates * (f.e. based on PAWS), we could truncate TIME-WAIT to several RTOs. * * The algorithm below is based on FORMAL INTERPRETATION of RFCs. * When you compare it to RFCs, please, read section SEGMENT ARRIVES * from the very beginning. * * NOTE. With recycling (and later with fin-wait-2) TW bucket * is _not_ stateless. It means, that strictly speaking we must * spinlock it. I do not want! Well, probability of misbehaviour * is ridiculously low and, seems, we could use some mb() tricks * to avoid misread sequence numbers, states etc. 
--ANK * * We don't need to initialize tmp_out.sack_ok as we don't use the results */ enum tcp_tw_status tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, const struct tcphdr *th) { struct tcp_options_received tmp_opt; struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); bool paws_reject = false; tmp_opt.saw_tstamp = 0; if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) { tcp_parse_options(twsk_net(tw), skb, &tmp_opt, 0, NULL); if (tmp_opt.saw_tstamp) { if (tmp_opt.rcv_tsecr) tmp_opt.rcv_tsecr -= tcptw->tw_ts_offset; tmp_opt.ts_recent = tcptw->tw_ts_recent; tmp_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; paws_reject = tcp_paws_reject(&tmp_opt, th->rst); } } if (tw->tw_substate == TCP_FIN_WAIT2) { /* Just repeat all the checks of tcp_rcv_state_process() */ /* Out of window, send ACK */ if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, tcptw->tw_rcv_nxt, tcptw->tw_rcv_nxt + tcptw->tw_rcv_wnd)) return tcp_timewait_check_oow_rate_limit( tw, skb, LINUX_MIB_TCPACKSKIPPEDFINWAIT2); if (th->rst) goto kill; if (th->syn && !before(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt)) return TCP_TW_RST; /* Dup ACK? */ if (!th->ack || !after(TCP_SKB_CB(skb)->end_seq, tcptw->tw_rcv_nxt) || TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) { inet_twsk_put(tw); return TCP_TW_SUCCESS; } /* New data or FIN. If new data arrive after half-duplex close, * reset. */ if (!th->fin || TCP_SKB_CB(skb)->end_seq != tcptw->tw_rcv_nxt + 1) return TCP_TW_RST; /* FIN arrived, enter true time-wait state. */ tw->tw_substate = TCP_TIME_WAIT; tcptw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq; if (tmp_opt.saw_tstamp) { tcptw->tw_ts_recent_stamp = ktime_get_seconds(); tcptw->tw_ts_recent = tmp_opt.rcv_tsval; } inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN); return TCP_TW_ACK; } /* * Now real TIME-WAIT state. * * RFC 1122: * "When a connection is [...] on TIME-WAIT state [...] * [a TCP] MAY accept a new SYN from the remote TCP to * reopen the connection directly, if it: * * (1) assigns its initial sequence number for the new * connection to be larger than the largest sequence * number it used on the previous connection incarnation, * and * * (2) returns to TIME-WAIT state if the SYN turns out * to be an old duplicate". */ if (!paws_reject && (TCP_SKB_CB(skb)->seq == tcptw->tw_rcv_nxt && (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq || th->rst))) { /* In window segment, it may be only reset or bare ack. */ if (th->rst) { /* This is TIME_WAIT assassination, in two flavors. * Oh well... nobody has a sufficient solution to this * protocol bug yet. */ if (!READ_ONCE(twsk_net(tw)->ipv4.sysctl_tcp_rfc1337)) { kill: inet_twsk_deschedule_put(tw); return TCP_TW_SUCCESS; } } else { inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN); } if (tmp_opt.saw_tstamp) { tcptw->tw_ts_recent = tmp_opt.rcv_tsval; tcptw->tw_ts_recent_stamp = ktime_get_seconds(); } inet_twsk_put(tw); return TCP_TW_SUCCESS; } /* Out of window segment. All the segments are ACKed immediately. The only exception is new SYN. We accept it, if it is not old duplicate and we are not in danger to be killed by delayed old duplicates. RFC check is that it has newer sequence number works at rates <40Mbit/sec. However, if paws works, it is reliable AND even more, we even may relax silly seq space cutoff. RED-PEN: we violate main RFC requirement, if this SYN will appear old duplicate (i.e. we receive RST in reply to SYN-ACK), we must return socket to time-wait state. It is not good, but not fatal yet. 
*/ if (th->syn && !th->rst && !th->ack && !paws_reject && (after(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt) || (tmp_opt.saw_tstamp && (s32)(tcptw->tw_ts_recent - tmp_opt.rcv_tsval) < 0))) { u32 isn = tcptw->tw_snd_nxt + 65535 + 2; if (isn == 0) isn++; TCP_SKB_CB(skb)->tcp_tw_isn = isn; return TCP_TW_SYN; } if (paws_reject) __NET_INC_STATS(twsk_net(tw), LINUX_MIB_PAWSESTABREJECTED); if (!th->rst) { /* In this case we must reset the TIMEWAIT timer. * * If it is ACKless SYN it may be both old duplicate * and new good SYN with random sequence number <rcv_nxt. * Do not reschedule in the last case. */ if (paws_reject || th->ack) inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN); return tcp_timewait_check_oow_rate_limit( tw, skb, LINUX_MIB_TCPACKSKIPPEDTIMEWAIT); } inet_twsk_put(tw); return TCP_TW_SUCCESS; } EXPORT_SYMBOL(tcp_timewait_state_process); /* * Move a socket to time-wait or dead fin-wait-2 state. */ void tcp_time_wait(struct sock *sk, int state, int timeo) { const struct inet_connection_sock *icsk = inet_csk(sk); const struct tcp_sock *tp = tcp_sk(sk); struct inet_timewait_sock *tw; struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row; tw = inet_twsk_alloc(sk, tcp_death_row, state); if (tw) { struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1); struct inet_sock *inet = inet_sk(sk); tw->tw_transparent = inet->transparent; tw->tw_mark = sk->sk_mark; tw->tw_priority = sk->sk_priority; tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale; tcptw->tw_rcv_nxt = tp->rcv_nxt; tcptw->tw_snd_nxt = tp->snd_nxt; tcptw->tw_rcv_wnd = tcp_receive_window(tp); tcptw->tw_ts_recent = tp->rx_opt.ts_recent; tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp; tcptw->tw_ts_offset = tp->tsoffset; tcptw->tw_last_oow_ack_time = 0; tcptw->tw_tx_delay = tp->tcp_tx_delay; #if IS_ENABLED(CONFIG_IPV6) if (tw->tw_family == PF_INET6) { struct ipv6_pinfo *np = inet6_sk(sk); tw->tw_v6_daddr = sk->sk_v6_daddr; tw->tw_v6_rcv_saddr = sk->sk_v6_rcv_saddr; tw->tw_tclass = np->tclass; tw->tw_flowlabel = be32_to_cpu(np->flow_label & IPV6_FLOWLABEL_MASK); tw->tw_txhash = sk->sk_txhash; tw->tw_ipv6only = sk->sk_ipv6only; } #endif #ifdef CONFIG_TCP_MD5SIG /* * The timewait bucket does not have the key DB from the * sock structure. We just make a quick copy of the * md5 key being used (if indeed we are using one) * so the timewait ack generating code has the key. */ do { tcptw->tw_md5_key = NULL; if (static_branch_unlikely(&tcp_md5_needed)) { struct tcp_md5sig_key *key; key = tp->af_specific->md5_lookup(sk, sk); if (key) { tcptw->tw_md5_key = kmemdup(key, sizeof(*key), GFP_ATOMIC); BUG_ON(tcptw->tw_md5_key && !tcp_alloc_md5sig_pool()); } } } while (0); #endif /* Get the TIME_WAIT timeout firing. */ if (timeo < rto) timeo = rto; if (state == TCP_TIME_WAIT) timeo = TCP_TIMEWAIT_LEN; /* tw_timer is pinned, so we need to make sure BH are disabled * in following section, otherwise timer handler could run before * we complete the initialization. */ local_bh_disable(); inet_twsk_schedule(tw, timeo); /* Linkage updates. * Note that access to tw after this point is illegal. */ inet_twsk_hashdance(tw, sk, &tcp_hashinfo); local_bh_enable(); } else { /* Sorry, if we're out of memory, just CLOSE this * socket up. We've got bigger problems than * non-graceful socket closings. 
*/ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPTIMEWAITOVERFLOW); } tcp_update_metrics(sk); tcp_done(sk); } EXPORT_SYMBOL(tcp_time_wait); void tcp_twsk_destructor(struct sock *sk) { #ifdef CONFIG_TCP_MD5SIG if (static_branch_unlikely(&tcp_md5_needed)) { struct tcp_timewait_sock *twsk = tcp_twsk(sk); if (twsk->tw_md5_key) kfree_rcu(twsk->tw_md5_key, rcu); } #endif } EXPORT_SYMBOL_GPL(tcp_twsk_destructor); /* Warning : This function is called without sk_listener being locked. * Be sure to read socket fields once, as their value could change under us. */ void tcp_openreq_init_rwin(struct request_sock *req, const struct sock *sk_listener, const struct dst_entry *dst) { struct inet_request_sock *ireq = inet_rsk(req); const struct tcp_sock *tp = tcp_sk(sk_listener); int full_space = tcp_full_space(sk_listener); u32 window_clamp; __u8 rcv_wscale; u32 rcv_wnd; int mss; mss = tcp_mss_clamp(tp, dst_metric_advmss(dst)); window_clamp = READ_ONCE(tp->window_clamp); /* Set this up on the first call only */ req->rsk_window_clamp = window_clamp ? : dst_metric(dst, RTAX_WINDOW); /* limit the window selection if the user enforce a smaller rx buffer */ if (sk_listener->sk_userlocks & SOCK_RCVBUF_LOCK && (req->rsk_window_clamp > full_space || req->rsk_window_clamp == 0)) req->rsk_window_clamp = full_space; rcv_wnd = tcp_rwnd_init_bpf((struct sock *)req); if (rcv_wnd == 0) rcv_wnd = dst_metric(dst, RTAX_INITRWND); else if (full_space < rcv_wnd * mss) full_space = rcv_wnd * mss; /* tcp_full_space because it is guaranteed to be the first packet */ tcp_select_initial_window(sk_listener, full_space, mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), &req->rsk_rcv_wnd, &req->rsk_window_clamp, ireq->wscale_ok, &rcv_wscale, rcv_wnd); ireq->rcv_wscale = rcv_wscale; } EXPORT_SYMBOL(tcp_openreq_init_rwin); static void tcp_ecn_openreq_child(struct tcp_sock *tp, const struct request_sock *req) { tp->ecn_flags = inet_rsk(req)->ecn_ok ? TCP_ECN_OK : 0; } void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst) { struct inet_connection_sock *icsk = inet_csk(sk); u32 ca_key = dst_metric(dst, RTAX_CC_ALGO); bool ca_got_dst = false; if (ca_key != TCP_CA_UNSPEC) { const struct tcp_congestion_ops *ca; rcu_read_lock(); ca = tcp_ca_find_key(ca_key); if (likely(ca && bpf_try_module_get(ca, ca->owner))) { icsk->icsk_ca_dst_locked = tcp_ca_dst_locked(dst); icsk->icsk_ca_ops = ca; ca_got_dst = true; } rcu_read_unlock(); } /* If no valid choice made yet, assign current system default ca. */ if (!ca_got_dst && (!icsk->icsk_ca_setsockopt || !bpf_try_module_get(icsk->icsk_ca_ops, icsk->icsk_ca_ops->owner))) tcp_assign_congestion_control(sk); tcp_set_ca_state(sk, TCP_CA_Open); } EXPORT_SYMBOL_GPL(tcp_ca_openreq_child); static void smc_check_reset_syn_req(struct tcp_sock *oldtp, struct request_sock *req, struct tcp_sock *newtp) { #if IS_ENABLED(CONFIG_SMC) struct inet_request_sock *ireq; if (static_branch_unlikely(&tcp_have_smc)) { ireq = inet_rsk(req); if (oldtp->syn_smc && !ireq->smc_ok) newtp->syn_smc = 0; } #endif } /* This is not only more efficient than what we used to do, it eliminates * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM * * Actually, we could lots of memory writes here. tp of listening * socket contains all necessary default parameters. 
*/ struct sock *tcp_create_openreq_child(const struct sock *sk, struct request_sock *req, struct sk_buff *skb) { struct sock *newsk = inet_csk_clone_lock(sk, req, GFP_ATOMIC); const struct inet_request_sock *ireq = inet_rsk(req); struct tcp_request_sock *treq = tcp_rsk(req); struct inet_connection_sock *newicsk; struct tcp_sock *oldtp, *newtp; u32 seq; if (!newsk) return NULL; newicsk = inet_csk(newsk); newtp = tcp_sk(newsk); oldtp = tcp_sk(sk); smc_check_reset_syn_req(oldtp, req, newtp); /* Now setup tcp_sock */ newtp->pred_flags = 0; seq = treq->rcv_isn + 1; newtp->rcv_wup = seq; WRITE_ONCE(newtp->copied_seq, seq); WRITE_ONCE(newtp->rcv_nxt, seq); newtp->segs_in = 1; seq = treq->snt_isn + 1; newtp->snd_sml = newtp->snd_una = seq; WRITE_ONCE(newtp->snd_nxt, seq); newtp->snd_up = seq; INIT_LIST_HEAD(&newtp->tsq_node); INIT_LIST_HEAD(&newtp->tsorted_sent_queue); tcp_init_wl(newtp, treq->rcv_isn); minmax_reset(&newtp->rtt_min, tcp_jiffies32, ~0U); newicsk->icsk_ack.lrcvtime = tcp_jiffies32; newtp->lsndtime = tcp_jiffies32; newsk->sk_txhash = treq->txhash; newtp->total_retrans = req->num_retrans; tcp_init_xmit_timers(newsk); WRITE_ONCE(newtp->write_seq, newtp->pushed_seq = treq->snt_isn + 1); if (sock_flag(newsk, SOCK_KEEPOPEN)) inet_csk_reset_keepalive_timer(newsk, keepalive_time_when(newtp)); newtp->rx_opt.tstamp_ok = ireq->tstamp_ok; newtp->rx_opt.sack_ok = ireq->sack_ok; newtp->window_clamp = req->rsk_window_clamp; newtp->rcv_ssthresh = req->rsk_rcv_wnd; newtp->rcv_wnd = req->rsk_rcv_wnd; newtp->rx_opt.wscale_ok = ireq->wscale_ok; if (newtp->rx_opt.wscale_ok) { newtp->rx_opt.snd_wscale = ireq->snd_wscale; newtp->rx_opt.rcv_wscale = ireq->rcv_wscale; } else { newtp->rx_opt.snd_wscale = newtp->rx_opt.rcv_wscale = 0; newtp->window_clamp = min(newtp->window_clamp, 65535U); } newtp->snd_wnd = ntohs(tcp_hdr(skb)->window) << newtp->rx_opt.snd_wscale; newtp->max_window = newtp->snd_wnd; if (newtp->rx_opt.tstamp_ok) { newtp->rx_opt.ts_recent = READ_ONCE(req->ts_recent); newtp->rx_opt.ts_recent_stamp = ktime_get_seconds(); newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED; } else { newtp->rx_opt.ts_recent_stamp = 0; newtp->tcp_header_len = sizeof(struct tcphdr); } if (req->num_timeout) { newtp->undo_marker = treq->snt_isn; newtp->retrans_stamp = div_u64(treq->snt_synack, USEC_PER_SEC / TCP_TS_HZ); } newtp->tsoffset = treq->ts_off; #ifdef CONFIG_TCP_MD5SIG newtp->md5sig_info = NULL; /*XXX*/ if (treq->af_specific->req_md5_lookup(sk, req_to_sk(req))) newtp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED; #endif if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len) newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len; newtp->rx_opt.mss_clamp = req->mss; tcp_ecn_openreq_child(newtp, req); newtp->fastopen_req = NULL; RCU_INIT_POINTER(newtp->fastopen_rsk, NULL); tcp_bpf_clone(sk, newsk); __TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS); return newsk; } EXPORT_SYMBOL(tcp_create_openreq_child); /* * Process an incoming packet for SYN_RECV sockets represented as a * request_sock. Normally sk is the listener socket but for TFO it * points to the child socket. * * XXX (TFO) - The current impl contains a special check for ack * validation and inside tcp_v4_reqsk_send_ack(). Can we do better? * * We don't need to initialize tmp_opt.sack_ok as we don't use the results * * Note: If @fastopen is true, this can be called from process context. * Otherwise, this is from BH context. 
*/ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, struct request_sock *req, bool fastopen, bool *req_stolen) { struct tcp_options_received tmp_opt; struct sock *child; const struct tcphdr *th = tcp_hdr(skb); __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); bool paws_reject = false; bool own_req; tmp_opt.saw_tstamp = 0; if (th->doff > (sizeof(struct tcphdr)>>2)) { tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0, NULL); if (tmp_opt.saw_tstamp) { tmp_opt.ts_recent = READ_ONCE(req->ts_recent); if (tmp_opt.rcv_tsecr) tmp_opt.rcv_tsecr -= tcp_rsk(req)->ts_off; /* We do not store true stamp, but it is not required, * it can be estimated (approximately) * from another data. */ tmp_opt.ts_recent_stamp = ktime_get_seconds() - reqsk_timeout(req, TCP_RTO_MAX) / HZ; paws_reject = tcp_paws_reject(&tmp_opt, th->rst); } } /* Check for pure retransmitted SYN. */ if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn && flg == TCP_FLAG_SYN && !paws_reject) { /* * RFC793 draws (Incorrectly! It was fixed in RFC1122) * this case on figure 6 and figure 8, but formal * protocol description says NOTHING. * To be more exact, it says that we should send ACK, * because this segment (at least, if it has no data) * is out of window. * * CONCLUSION: RFC793 (even with RFC1122) DOES NOT * describe SYN-RECV state. All the description * is wrong, we cannot believe to it and should * rely only on common sense and implementation * experience. * * Enforce "SYN-ACK" according to figure 8, figure 6 * of RFC793, fixed by RFC1122. * * Note that even if there is new data in the SYN packet * they will be thrown away too. * * Reset timer after retransmitting SYNACK, similar to * the idea of fast retransmit in recovery. */ if (!tcp_oow_rate_limited(sock_net(sk), skb, LINUX_MIB_TCPACKSKIPPEDSYNRECV, &tcp_rsk(req)->last_oow_ack_time) && !inet_rtx_syn_ack(sk, req)) { unsigned long expires = jiffies; expires += reqsk_timeout(req, TCP_RTO_MAX); if (!fastopen) mod_timer_pending(&req->rsk_timer, expires); else req->rsk_timer.expires = expires; } return NULL; } /* Further reproduces section "SEGMENT ARRIVES" for state SYN-RECEIVED of RFC793. It is broken, however, it does not work only when SYNs are crossed. You would think that SYN crossing is impossible here, since we should have a SYN_SENT socket (from connect()) on our end, but this is not true if the crossed SYNs were sent to both ends by a malicious third party. We must defend against this, and to do that we first verify the ACK (as per RFC793, page 36) and reset if it is invalid. Is this a true full defense? To convince ourselves, let us consider a way in which the ACK test can still pass in this 'malicious crossed SYNs' case. Malicious sender sends identical SYNs (and thus identical sequence numbers) to both A and B: A: gets SYN, seq=7 B: gets SYN, seq=7 By our good fortune, both A and B select the same initial send sequence number of seven :-) A: sends SYN|ACK, seq=7, ack_seq=8 B: sends SYN|ACK, seq=7, ack_seq=8 So we are now A eating this SYN|ACK, ACK test passes. So does sequence test, SYN is truncated, and thus we consider it a bare ACK. If icsk->icsk_accept_queue.rskq_defer_accept, we silently drop this bare ACK. Otherwise, we create an established connection. Both ends (listening sockets) accept the new incoming connection and try to talk to each other. 8-) Note: This case is both harmless, and rare. Possibility is about the same as us discovering intelligent life on another plant tomorrow. But generally, we should (RFC lies!) 
to accept ACK from SYNACK both here and in tcp_rcv_state_process(). tcp_rcv_state_process() does not, hence, we do not too. Note that the case is absolutely generic: we cannot optimize anything here without violating protocol. All the checks must be made before attempt to create socket. */ /* RFC793 page 36: "If the connection is in any non-synchronized state ... * and the incoming segment acknowledges something not yet * sent (the segment carries an unacceptable ACK) ... * a reset is sent." * * Invalid ACK: reset will be sent by listening socket. * Note that the ACK validity check for a Fast Open socket is done * elsewhere and is checked directly against the child socket rather * than req because user data may have been sent out. */ if ((flg & TCP_FLAG_ACK) && !fastopen && (TCP_SKB_CB(skb)->ack_seq != tcp_rsk(req)->snt_isn + 1)) return sk; /* Also, it would be not so bad idea to check rcv_tsecr, which * is essentially ACK extension and too early or too late values * should cause reset in unsynchronized states. */ /* RFC793: "first check sequence number". */ if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, tcp_rsk(req)->rcv_nxt, tcp_rsk(req)->rcv_nxt + req->rsk_rcv_wnd)) { /* Out of window: send ACK and drop. */ if (!(flg & TCP_FLAG_RST) && !tcp_oow_rate_limited(sock_net(sk), skb, LINUX_MIB_TCPACKSKIPPEDSYNRECV, &tcp_rsk(req)->last_oow_ack_time)) req->rsk_ops->send_ack(sk, skb, req); if (paws_reject) NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED); return NULL; } /* In sequence, PAWS is OK. */ if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) { /* Truncate SYN, it is out of window starting at tcp_rsk(req)->rcv_isn + 1. */ flg &= ~TCP_FLAG_SYN; } /* RFC793: "second check the RST bit" and * "fourth, check the SYN bit" */ if (flg & (TCP_FLAG_RST|TCP_FLAG_SYN)) { TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS); goto embryonic_reset; } /* ACK sequence verified above, just make sure ACK is * set. If ACK not set, just silently drop the packet. * * XXX (TFO) - if we ever allow "data after SYN", the * following check needs to be removed. */ if (!(flg & TCP_FLAG_ACK)) return NULL; /* For Fast Open no more processing is needed (sk is the * child socket). */ if (fastopen) return sk; /* While TCP_DEFER_ACCEPT is active, drop bare ACK. */ if (req->num_timeout < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept && TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) { inet_rsk(req)->acked = 1; __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDEFERACCEPTDROP); return NULL; } /* OK, ACK is valid, create big socket and * feed this segment to it. It will repeat all * the tests. THIS SEGMENT MUST MOVE SOCKET TO * ESTABLISHED STATE. If it will be dropped after * socket is created, wait for troubles. 
*/ child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL, req, &own_req); if (!child) goto listen_overflow; if (own_req && tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_nxt)) tcp_sk(child)->rx_opt.ts_recent = tmp_opt.rcv_tsval; if (own_req && rsk_drop_req(req)) { reqsk_queue_removed(&inet_csk(req->rsk_listener)->icsk_accept_queue, req); inet_csk_reqsk_queue_drop_and_put(req->rsk_listener, req); return child; } sock_rps_save_rxhash(child, skb); tcp_synack_rtt_meas(child, req); *req_stolen = !own_req; return inet_csk_complete_hashdance(sk, child, req, own_req); listen_overflow: if (sk != req->rsk_listener) __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMIGRATEREQFAILURE); if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_abort_on_overflow)) { inet_rsk(req)->acked = 1; return NULL; } embryonic_reset: if (!(flg & TCP_FLAG_RST)) { /* Received a bad SYN pkt - for TFO We try not to reset * the local connection unless it's really necessary to * avoid becoming vulnerable to outside attack aiming at * resetting legit local connections. */ req->rsk_ops->send_reset(sk, skb); } else if (fastopen) { /* received a valid RST pkt */ reqsk_fastopen_remove(sk, req, true); tcp_reset(sk, skb); } if (!fastopen) { bool unlinked = inet_csk_reqsk_queue_drop(sk, req); if (unlinked) __NET_INC_STATS(sock_net(sk), LINUX_MIB_EMBRYONICRSTS); *req_stolen = !unlinked; } return NULL; } EXPORT_SYMBOL(tcp_check_req); /* * Queue segment on the new socket if the new socket is active, * otherwise we just shortcircuit this and continue with * the new socket. * * For the vast majority of cases child->sk_state will be TCP_SYN_RECV * when entering. But other states are possible due to a race condition * where after __inet_lookup_established() fails but before the listener * locked is obtained, other packets cause the same connection to * be created. */ int tcp_child_process(struct sock *parent, struct sock *child, struct sk_buff *skb) __releases(&((child)->sk_lock.slock)) { int ret = 0; int state = child->sk_state; /* record NAPI ID of child */ sk_mark_napi_id(child, skb); tcp_segs_in(tcp_sk(child), skb); if (!sock_owned_by_user(child)) { ret = tcp_rcv_state_process(child, skb); /* Wakeup parent, send SIGIO */ if (state == TCP_SYN_RECV && child->sk_state != state) parent->sk_data_ready(parent); } else { /* Alas, it is possible again, because we do lookup * in main socket hash table and lock on listening * socket does not protect us more. */ __sk_add_backlog(child, skb); } bh_unlock_sock(child); sock_put(child); return ret; } EXPORT_SYMBOL(tcp_child_process);
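/*
 * Editor's note: the window test above (tcp_in_window(), used both by
 * tcp_timewait_state_process() and tcp_check_req()) relies on modular
 * 32-bit sequence arithmetic.  The following is a minimal, stand-alone
 * user-space sketch of that idea, not kernel code; the helper names
 * seq_before(), seq_after() and seg_in_window() are invented for the
 * illustration.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* "before" in sequence space: true if a precedes b modulo 2^32. */
static bool seq_before(uint32_t a, uint32_t b)
{
	return (int32_t)(a - b) < 0;
}

static bool seq_after(uint32_t a, uint32_t b)
{
	return seq_before(b, a);
}

/*
 * Does the segment [seq, end_seq] overlap the receive window
 * [s_win, e_win)?  Mirrors the structure of tcp_in_window() above.
 */
static bool seg_in_window(uint32_t seq, uint32_t end_seq,
			  uint32_t s_win, uint32_t e_win)
{
	if (seq == s_win)
		return true;
	if (seq_after(end_seq, s_win) && seq_before(seq, e_win))
		return true;
	return seq == e_win && seq == end_seq;
}

int main(void)
{
	/* Receive window straddling the 2^32 wrap point. */
	uint32_t rcv_nxt = 0xfffffff0u, rcv_wnd = 0x1000;

	printf("%d\n", seg_in_window(0x00000010u, 0x00000020u,
				     rcv_nxt, rcv_nxt + rcv_wnd)); /* 1: inside  */
	printf("%d\n", seg_in_window(0x00010000u, 0x00010010u,
				     rcv_nxt, rcv_nxt + rcv_wnd)); /* 0: outside */
	return 0;
}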
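/*
 * Editor's note: several checks above compare TCP timestamps with the
 * same signed-difference trick used for sequence numbers, e.g.
 * "(s32)(tcptw->tw_ts_recent - tmp_opt.rcv_tsval) < 0" when deciding
 * whether a SYN hitting a TIME-WAIT socket is new.  Below is a small,
 * stand-alone illustration (not kernel code, ts_newer() is an invented
 * name) of why the cast keeps working across 32-bit wraparound.
 */
#include <stdint.h>
#include <stdio.h>

/* True if timestamp b is newer than a, assuming they differ by < 2^31. */
static int ts_newer(uint32_t a, uint32_t b)
{
	return (int32_t)(a - b) < 0;
}

int main(void)
{
	uint32_t old_ts = 0xfffffffeu;	/* just before the 32-bit wrap */
	uint32_t new_ts = 0x00000005u;	/* shortly after the wrap      */

	printf("%d\n", ts_newer(old_ts, new_ts)); /* 1: new_ts is newer */
	printf("%d\n", ts_newer(new_ts, old_ts)); /* 0: old_ts is older */
	return 0;
}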
// SPDX-License-Identifier: GPL-2.0
/*
 * linux/fs/ext4/xattr.c
 *
 * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
 *
 * Fix by Harrison Xing <harrison@mountainviewdata.com>.
 * Ext4 code with a lot of help from Eric Jarman <ejarman@acm.org>.
 * Extended attributes for symlinks and special files added per
 *  suggestion of Luka Renko <luka.renko@hermes.si>.
 * xattr consolidation Copyright (c) 2004 James Morris <jmorris@redhat.com>,
 *  Red Hat Inc.
 * ea-in-inode support by Alex Tomas <alex@clusterfs.com> aka bzzz
 *  and Andreas Gruenbacher <agruen@suse.de>.
 */

/*
 * Extended attributes are stored directly in inodes (on file systems with
 * inodes bigger than 128 bytes) and on additional disk blocks. The i_file_acl
 * field contains the block number if an inode uses an additional block. All
 * attributes must fit in the inode and one additional block. Blocks that
 * contain the identical set of attributes may be shared among several inodes.
 * Identical blocks are detected by keeping a cache of blocks that have
 * recently been accessed.
 *
 * The attributes in inodes and on blocks have a different header; the entries
 * are stored in the same format:
 *
 *   +------------------+
 *   | header           |
 *   | entry 1          | |
 *   | entry 2          | | growing downwards
 *   | entry 3          | v
 *   | four null bytes  |
 *   | . . .            |
 *   | value 1          | ^
 *   | value 3          | | growing upwards
 *   | value 2          | |
 *   +------------------+
 *
 * The header is followed by multiple entry descriptors. In disk blocks, the
 * entry descriptors are kept sorted. In inodes, they are unsorted. The
 * attribute values are aligned to the end of the block in no specific order.
 *
 * Locking strategy
 * ----------------
 * EXT4_I(inode)->i_file_acl is protected by EXT4_I(inode)->xattr_sem.
 * EA blocks are only changed if they are exclusive to an inode, so
 * holding xattr_sem also means that nothing but the EA block's reference
 * count can change. Multiple writers to the same block are synchronized
 * by the buffer lock.
 */

#include <linux/init.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/mbcache.h>
#include <linux/quotaops.h>
#include <linux/iversion.h>
#include "ext4_jbd2.h"
#include "ext4.h"
#include "xattr.h"
#include "acl.h"

#ifdef EXT4_XATTR_DEBUG
# define ea_idebug(inode, fmt, ...)					\
	printk(KERN_DEBUG "inode %s:%lu: " fmt "\n",			\
	       inode->i_sb->s_id, inode->i_ino, ##__VA_ARGS__)
# define ea_bdebug(bh, fmt, ...)					\
	printk(KERN_DEBUG "block %pg:%lu: " fmt "\n",			\
	       bh->b_bdev, (unsigned long)bh->b_blocknr, ##__VA_ARGS__)
#else
# define ea_idebug(inode, fmt, ...)	no_printk(fmt, ##__VA_ARGS__)
# define ea_bdebug(bh, fmt, ...)
no_printk(fmt, ##__VA_ARGS__) #endif static void ext4_xattr_block_cache_insert(struct mb_cache *, struct buffer_head *); static struct buffer_head * ext4_xattr_block_cache_find(struct inode *, struct ext4_xattr_header *, struct mb_cache_entry **); static __le32 ext4_xattr_hash_entry(char *name, size_t name_len, __le32 *value, size_t value_count); static void ext4_xattr_rehash(struct ext4_xattr_header *); static const struct xattr_handler * const ext4_xattr_handler_map[] = { [EXT4_XATTR_INDEX_USER] = &ext4_xattr_user_handler, #ifdef CONFIG_EXT4_FS_POSIX_ACL [EXT4_XATTR_INDEX_POSIX_ACL_ACCESS] = &posix_acl_access_xattr_handler, [EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT] = &posix_acl_default_xattr_handler, #endif [EXT4_XATTR_INDEX_TRUSTED] = &ext4_xattr_trusted_handler, #ifdef CONFIG_EXT4_FS_SECURITY [EXT4_XATTR_INDEX_SECURITY] = &ext4_xattr_security_handler, #endif [EXT4_XATTR_INDEX_HURD] = &ext4_xattr_hurd_handler, }; const struct xattr_handler *ext4_xattr_handlers[] = { &ext4_xattr_user_handler, &ext4_xattr_trusted_handler, #ifdef CONFIG_EXT4_FS_POSIX_ACL &posix_acl_access_xattr_handler, &posix_acl_default_xattr_handler, #endif #ifdef CONFIG_EXT4_FS_SECURITY &ext4_xattr_security_handler, #endif &ext4_xattr_hurd_handler, NULL }; #define EA_BLOCK_CACHE(inode) (((struct ext4_sb_info *) \ inode->i_sb->s_fs_info)->s_ea_block_cache) #define EA_INODE_CACHE(inode) (((struct ext4_sb_info *) \ inode->i_sb->s_fs_info)->s_ea_inode_cache) static int ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array, struct inode *inode); #ifdef CONFIG_LOCKDEP void ext4_xattr_inode_set_class(struct inode *ea_inode) { struct ext4_inode_info *ei = EXT4_I(ea_inode); lockdep_set_subclass(&ea_inode->i_rwsem, 1); (void) ei; /* shut up clang warning if !CONFIG_LOCKDEP */ lockdep_set_subclass(&ei->i_data_sem, I_DATA_SEM_EA); } #endif static __le32 ext4_xattr_block_csum(struct inode *inode, sector_t block_nr, struct ext4_xattr_header *hdr) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); __u32 csum; __le64 dsk_block_nr = cpu_to_le64(block_nr); __u32 dummy_csum = 0; int offset = offsetof(struct ext4_xattr_header, h_checksum); csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&dsk_block_nr, sizeof(dsk_block_nr)); csum = ext4_chksum(sbi, csum, (__u8 *)hdr, offset); csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, sizeof(dummy_csum)); offset += sizeof(dummy_csum); csum = ext4_chksum(sbi, csum, (__u8 *)hdr + offset, EXT4_BLOCK_SIZE(inode->i_sb) - offset); return cpu_to_le32(csum); } static int ext4_xattr_block_csum_verify(struct inode *inode, struct buffer_head *bh) { struct ext4_xattr_header *hdr = BHDR(bh); int ret = 1; if (ext4_has_metadata_csum(inode->i_sb)) { lock_buffer(bh); ret = (hdr->h_checksum == ext4_xattr_block_csum(inode, bh->b_blocknr, hdr)); unlock_buffer(bh); } return ret; } static void ext4_xattr_block_csum_set(struct inode *inode, struct buffer_head *bh) { if (ext4_has_metadata_csum(inode->i_sb)) BHDR(bh)->h_checksum = ext4_xattr_block_csum(inode, bh->b_blocknr, BHDR(bh)); } static inline const struct xattr_handler * ext4_xattr_handler(int name_index) { const struct xattr_handler *handler = NULL; if (name_index > 0 && name_index < ARRAY_SIZE(ext4_xattr_handler_map)) handler = ext4_xattr_handler_map[name_index]; return handler; } static int ext4_xattr_check_entries(struct ext4_xattr_entry *entry, void *end, void *value_start) { struct ext4_xattr_entry *e = entry; /* Find the end of the names list */ while (!IS_LAST_ENTRY(e)) { struct ext4_xattr_entry *next = EXT4_XATTR_NEXT(e); if ((void *)next >= 
end) return -EFSCORRUPTED; if (strnlen(e->e_name, e->e_name_len) != e->e_name_len) return -EFSCORRUPTED; e = next; } /* Check the values */ while (!IS_LAST_ENTRY(entry)) { u32 size = le32_to_cpu(entry->e_value_size); if (size > EXT4_XATTR_SIZE_MAX) return -EFSCORRUPTED; if (size != 0 && entry->e_value_inum == 0) { u16 offs = le16_to_cpu(entry->e_value_offs); void *value; /* * The value cannot overlap the names, and the value * with padding cannot extend beyond 'end'. Check both * the padded and unpadded sizes, since the size may * overflow to 0 when adding padding. */ if (offs > end - value_start) return -EFSCORRUPTED; value = value_start + offs; if (value < (void *)e + sizeof(u32) || size > end - value || EXT4_XATTR_SIZE(size) > end - value) return -EFSCORRUPTED; } entry = EXT4_XATTR_NEXT(entry); } return 0; } static inline int __ext4_xattr_check_block(struct inode *inode, struct buffer_head *bh, const char *function, unsigned int line) { int error = -EFSCORRUPTED; if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) || BHDR(bh)->h_blocks != cpu_to_le32(1)) goto errout; if (buffer_verified(bh)) return 0; error = -EFSBADCRC; if (!ext4_xattr_block_csum_verify(inode, bh)) goto errout; error = ext4_xattr_check_entries(BFIRST(bh), bh->b_data + bh->b_size, bh->b_data); errout: if (error) __ext4_error_inode(inode, function, line, 0, -error, "corrupted xattr block %llu", (unsigned long long) bh->b_blocknr); else set_buffer_verified(bh); return error; } #define ext4_xattr_check_block(inode, bh) \ __ext4_xattr_check_block((inode), (bh), __func__, __LINE__) static int __xattr_check_inode(struct inode *inode, struct ext4_xattr_ibody_header *header, void *end, const char *function, unsigned int line) { int error = -EFSCORRUPTED; if (end - (void *)header < sizeof(*header) + sizeof(u32) || (header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC))) goto errout; error = ext4_xattr_check_entries(IFIRST(header), end, IFIRST(header)); errout: if (error) __ext4_error_inode(inode, function, line, 0, -error, "corrupted in-inode xattr"); return error; } #define xattr_check_inode(inode, header, end) \ __xattr_check_inode((inode), (header), (end), __func__, __LINE__) static int xattr_find_entry(struct inode *inode, struct ext4_xattr_entry **pentry, void *end, int name_index, const char *name, int sorted) { struct ext4_xattr_entry *entry, *next; size_t name_len; int cmp = 1; if (name == NULL) return -EINVAL; name_len = strlen(name); for (entry = *pentry; !IS_LAST_ENTRY(entry); entry = next) { next = EXT4_XATTR_NEXT(entry); if ((void *) next >= end) { EXT4_ERROR_INODE(inode, "corrupted xattr entries"); return -EFSCORRUPTED; } cmp = name_index - entry->e_name_index; if (!cmp) cmp = name_len - entry->e_name_len; if (!cmp) cmp = memcmp(name, entry->e_name, name_len); if (cmp <= 0 && (sorted || cmp == 0)) break; } *pentry = entry; return cmp ? 
-ENODATA : 0; } static u32 ext4_xattr_inode_hash(struct ext4_sb_info *sbi, const void *buffer, size_t size) { return ext4_chksum(sbi, sbi->s_csum_seed, buffer, size); } static u64 ext4_xattr_inode_get_ref(struct inode *ea_inode) { return ((u64)ea_inode->i_ctime.tv_sec << 32) | (u32) inode_peek_iversion_raw(ea_inode); } static void ext4_xattr_inode_set_ref(struct inode *ea_inode, u64 ref_count) { ea_inode->i_ctime.tv_sec = (u32)(ref_count >> 32); inode_set_iversion_raw(ea_inode, ref_count & 0xffffffff); } static u32 ext4_xattr_inode_get_hash(struct inode *ea_inode) { return (u32)ea_inode->i_atime.tv_sec; } static void ext4_xattr_inode_set_hash(struct inode *ea_inode, u32 hash) { ea_inode->i_atime.tv_sec = hash; } /* * Read the EA value from an inode. */ static int ext4_xattr_inode_read(struct inode *ea_inode, void *buf, size_t size) { int blocksize = 1 << ea_inode->i_blkbits; int bh_count = (size + blocksize - 1) >> ea_inode->i_blkbits; int tail_size = (size % blocksize) ?: blocksize; struct buffer_head *bhs_inline[8]; struct buffer_head **bhs = bhs_inline; int i, ret; if (bh_count > ARRAY_SIZE(bhs_inline)) { bhs = kmalloc_array(bh_count, sizeof(*bhs), GFP_NOFS); if (!bhs) return -ENOMEM; } ret = ext4_bread_batch(ea_inode, 0 /* block */, bh_count, true /* wait */, bhs); if (ret) goto free_bhs; for (i = 0; i < bh_count; i++) { /* There shouldn't be any holes in ea_inode. */ if (!bhs[i]) { ret = -EFSCORRUPTED; goto put_bhs; } memcpy((char *)buf + blocksize * i, bhs[i]->b_data, i < bh_count - 1 ? blocksize : tail_size); } ret = 0; put_bhs: for (i = 0; i < bh_count; i++) brelse(bhs[i]); free_bhs: if (bhs != bhs_inline) kfree(bhs); return ret; } #define EXT4_XATTR_INODE_GET_PARENT(inode) ((__u32)(inode)->i_mtime.tv_sec) static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino, u32 ea_inode_hash, struct inode **ea_inode) { struct inode *inode; int err; /* * We have to check for this corruption early as otherwise * iget_locked() could wait indefinitely for the state of our * parent inode. */ if (parent->i_ino == ea_ino) { ext4_error(parent->i_sb, "Parent and EA inode have the same ino %lu", ea_ino); return -EFSCORRUPTED; } inode = ext4_iget(parent->i_sb, ea_ino, EXT4_IGET_EA_INODE); if (IS_ERR(inode)) { err = PTR_ERR(inode); ext4_error(parent->i_sb, "error while reading EA inode %lu err=%d", ea_ino, err); return err; } ext4_xattr_inode_set_class(inode); /* * Check whether this is an old Lustre-style xattr inode. Lustre * implementation does not have hash validation, rather it has a * backpointer from ea_inode to the parent inode. 
*/ if (ea_inode_hash != ext4_xattr_inode_get_hash(inode) && EXT4_XATTR_INODE_GET_PARENT(inode) == parent->i_ino && inode->i_generation == parent->i_generation) { ext4_set_inode_state(inode, EXT4_STATE_LUSTRE_EA_INODE); ext4_xattr_inode_set_ref(inode, 1); } else { inode_lock_nested(inode, I_MUTEX_XATTR); inode->i_flags |= S_NOQUOTA; inode_unlock(inode); } *ea_inode = inode; return 0; } /* Remove entry from mbcache when EA inode is getting evicted */ void ext4_evict_ea_inode(struct inode *inode) { struct mb_cache_entry *oe; if (!EA_INODE_CACHE(inode)) return; /* Wait for entry to get unused so that we can remove it */ while ((oe = mb_cache_entry_delete_or_get(EA_INODE_CACHE(inode), ext4_xattr_inode_get_hash(inode), inode->i_ino))) { mb_cache_entry_wait_unused(oe); mb_cache_entry_put(EA_INODE_CACHE(inode), oe); } } static int ext4_xattr_inode_verify_hashes(struct inode *ea_inode, struct ext4_xattr_entry *entry, void *buffer, size_t size) { u32 hash; /* Verify stored hash matches calculated hash. */ hash = ext4_xattr_inode_hash(EXT4_SB(ea_inode->i_sb), buffer, size); if (hash != ext4_xattr_inode_get_hash(ea_inode)) return -EFSCORRUPTED; if (entry) { __le32 e_hash, tmp_data; /* Verify entry hash. */ tmp_data = cpu_to_le32(hash); e_hash = ext4_xattr_hash_entry(entry->e_name, entry->e_name_len, &tmp_data, 1); if (e_hash != entry->e_hash) return -EFSCORRUPTED; } return 0; } /* * Read xattr value from the EA inode. */ static int ext4_xattr_inode_get(struct inode *inode, struct ext4_xattr_entry *entry, void *buffer, size_t size) { struct mb_cache *ea_inode_cache = EA_INODE_CACHE(inode); struct inode *ea_inode; int err; err = ext4_xattr_inode_iget(inode, le32_to_cpu(entry->e_value_inum), le32_to_cpu(entry->e_hash), &ea_inode); if (err) { ea_inode = NULL; goto out; } if (i_size_read(ea_inode) != size) { ext4_warning_inode(ea_inode, "ea_inode file size=%llu entry size=%zu", i_size_read(ea_inode), size); err = -EFSCORRUPTED; goto out; } err = ext4_xattr_inode_read(ea_inode, buffer, size); if (err) goto out; if (!ext4_test_inode_state(ea_inode, EXT4_STATE_LUSTRE_EA_INODE)) { err = ext4_xattr_inode_verify_hashes(ea_inode, entry, buffer, size); if (err) { ext4_warning_inode(ea_inode, "EA inode hash validation failed"); goto out; } if (ea_inode_cache) mb_cache_entry_create(ea_inode_cache, GFP_NOFS, ext4_xattr_inode_get_hash(ea_inode), ea_inode->i_ino, true /* reusable */); } out: iput(ea_inode); return err; } static int ext4_xattr_block_get(struct inode *inode, int name_index, const char *name, void *buffer, size_t buffer_size) { struct buffer_head *bh = NULL; struct ext4_xattr_entry *entry; size_t size; void *end; int error; struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode); ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld", name_index, name, buffer, (long)buffer_size); if (!EXT4_I(inode)->i_file_acl) return -ENODATA; ea_idebug(inode, "reading block %llu", (unsigned long long)EXT4_I(inode)->i_file_acl); bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO); if (IS_ERR(bh)) return PTR_ERR(bh); ea_bdebug(bh, "b_count=%d, refcount=%d", atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); error = ext4_xattr_check_block(inode, bh); if (error) goto cleanup; ext4_xattr_block_cache_insert(ea_block_cache, bh); entry = BFIRST(bh); end = bh->b_data + bh->b_size; error = xattr_find_entry(inode, &entry, end, name_index, name, 1); if (error) goto cleanup; size = le32_to_cpu(entry->e_value_size); error = -ERANGE; if (unlikely(size > EXT4_XATTR_SIZE_MAX)) goto cleanup; if (buffer) 
{ if (size > buffer_size) goto cleanup; if (entry->e_value_inum) { error = ext4_xattr_inode_get(inode, entry, buffer, size); if (error) goto cleanup; } else { u16 offset = le16_to_cpu(entry->e_value_offs); void *p = bh->b_data + offset; if (unlikely(p + size > end)) goto cleanup; memcpy(buffer, p, size); } } error = size; cleanup: brelse(bh); return error; } int ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name, void *buffer, size_t buffer_size) { struct ext4_xattr_ibody_header *header; struct ext4_xattr_entry *entry; struct ext4_inode *raw_inode; struct ext4_iloc iloc; size_t size; void *end; int error; if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR)) return -ENODATA; error = ext4_get_inode_loc(inode, &iloc); if (error) return error; raw_inode = ext4_raw_inode(&iloc); header = IHDR(inode, raw_inode); end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; error = xattr_check_inode(inode, header, end); if (error) goto cleanup; entry = IFIRST(header); error = xattr_find_entry(inode, &entry, end, name_index, name, 0); if (error) goto cleanup; size = le32_to_cpu(entry->e_value_size); error = -ERANGE; if (unlikely(size > EXT4_XATTR_SIZE_MAX)) goto cleanup; if (buffer) { if (size > buffer_size) goto cleanup; if (entry->e_value_inum) { error = ext4_xattr_inode_get(inode, entry, buffer, size); if (error) goto cleanup; } else { u16 offset = le16_to_cpu(entry->e_value_offs); void *p = (void *)IFIRST(header) + offset; if (unlikely(p + size > end)) goto cleanup; memcpy(buffer, p, size); } } error = size; cleanup: brelse(iloc.bh); return error; } /* * ext4_xattr_get() * * Copy an extended attribute into the buffer * provided, or compute the buffer size required. * Buffer is NULL to compute the size of the buffer required. * * Returns a negative error number on failure, or the number of bytes * used / required on success. 
*/ int ext4_xattr_get(struct inode *inode, int name_index, const char *name, void *buffer, size_t buffer_size) { int error; if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) return -EIO; if (strlen(name) > 255) return -ERANGE; down_read(&EXT4_I(inode)->xattr_sem); error = ext4_xattr_ibody_get(inode, name_index, name, buffer, buffer_size); if (error == -ENODATA) error = ext4_xattr_block_get(inode, name_index, name, buffer, buffer_size); up_read(&EXT4_I(inode)->xattr_sem); return error; } static int ext4_xattr_list_entries(struct dentry *dentry, struct ext4_xattr_entry *entry, char *buffer, size_t buffer_size) { size_t rest = buffer_size; for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) { const struct xattr_handler *handler = ext4_xattr_handler(entry->e_name_index); if (handler && (!handler->list || handler->list(dentry))) { const char *prefix = handler->prefix ?: handler->name; size_t prefix_len = strlen(prefix); size_t size = prefix_len + entry->e_name_len + 1; if (buffer) { if (size > rest) return -ERANGE; memcpy(buffer, prefix, prefix_len); buffer += prefix_len; memcpy(buffer, entry->e_name, entry->e_name_len); buffer += entry->e_name_len; *buffer++ = 0; } rest -= size; } } return buffer_size - rest; /* total size */ } static int ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size) { struct inode *inode = d_inode(dentry); struct buffer_head *bh = NULL; int error; ea_idebug(inode, "buffer=%p, buffer_size=%ld", buffer, (long)buffer_size); if (!EXT4_I(inode)->i_file_acl) return 0; ea_idebug(inode, "reading block %llu", (unsigned long long)EXT4_I(inode)->i_file_acl); bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO); if (IS_ERR(bh)) return PTR_ERR(bh); ea_bdebug(bh, "b_count=%d, refcount=%d", atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); error = ext4_xattr_check_block(inode, bh); if (error) goto cleanup; ext4_xattr_block_cache_insert(EA_BLOCK_CACHE(inode), bh); error = ext4_xattr_list_entries(dentry, BFIRST(bh), buffer, buffer_size); cleanup: brelse(bh); return error; } static int ext4_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size) { struct inode *inode = d_inode(dentry); struct ext4_xattr_ibody_header *header; struct ext4_inode *raw_inode; struct ext4_iloc iloc; void *end; int error; if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR)) return 0; error = ext4_get_inode_loc(inode, &iloc); if (error) return error; raw_inode = ext4_raw_inode(&iloc); header = IHDR(inode, raw_inode); end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; error = xattr_check_inode(inode, header, end); if (error) goto cleanup; error = ext4_xattr_list_entries(dentry, IFIRST(header), buffer, buffer_size); cleanup: brelse(iloc.bh); return error; } /* * Inode operation listxattr() * * d_inode(dentry)->i_rwsem: don't care * * Copy a list of attribute names into the buffer * provided, or compute the buffer size required. * Buffer is NULL to compute the size of the buffer required. * * Returns a negative error number on failure, or the number of bytes * used / required on success. 
*/ ssize_t ext4_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) { int ret, ret2; down_read(&EXT4_I(d_inode(dentry))->xattr_sem); ret = ret2 = ext4_xattr_ibody_list(dentry, buffer, buffer_size); if (ret < 0) goto errout; if (buffer) { buffer += ret; buffer_size -= ret; } ret = ext4_xattr_block_list(dentry, buffer, buffer_size); if (ret < 0) goto errout; ret += ret2; errout: up_read(&EXT4_I(d_inode(dentry))->xattr_sem); return ret; } /* * If the EXT4_FEATURE_COMPAT_EXT_ATTR feature of this file system is * not set, set it. */ static void ext4_xattr_update_super_block(handle_t *handle, struct super_block *sb) { if (ext4_has_feature_xattr(sb)) return; BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access"); if (ext4_journal_get_write_access(handle, sb, EXT4_SB(sb)->s_sbh, EXT4_JTR_NONE) == 0) { lock_buffer(EXT4_SB(sb)->s_sbh); ext4_set_feature_xattr(sb); ext4_superblock_csum_set(sb); unlock_buffer(EXT4_SB(sb)->s_sbh); ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); } } int ext4_get_inode_usage(struct inode *inode, qsize_t *usage) { struct ext4_iloc iloc = { .bh = NULL }; struct buffer_head *bh = NULL; struct ext4_inode *raw_inode; struct ext4_xattr_ibody_header *header; struct ext4_xattr_entry *entry; qsize_t ea_inode_refs = 0; void *end; int ret; lockdep_assert_held_read(&EXT4_I(inode)->xattr_sem); if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) { ret = ext4_get_inode_loc(inode, &iloc); if (ret) goto out; raw_inode = ext4_raw_inode(&iloc); header = IHDR(inode, raw_inode); end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; ret = xattr_check_inode(inode, header, end); if (ret) goto out; for (entry = IFIRST(header); !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) if (entry->e_value_inum) ea_inode_refs++; } if (EXT4_I(inode)->i_file_acl) { bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO); if (IS_ERR(bh)) { ret = PTR_ERR(bh); bh = NULL; goto out; } ret = ext4_xattr_check_block(inode, bh); if (ret) goto out; for (entry = BFIRST(bh); !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) if (entry->e_value_inum) ea_inode_refs++; } *usage = ea_inode_refs + 1; ret = 0; out: brelse(iloc.bh); brelse(bh); return ret; } static inline size_t round_up_cluster(struct inode *inode, size_t length) { struct super_block *sb = inode->i_sb; size_t cluster_size = 1 << (EXT4_SB(sb)->s_cluster_bits + inode->i_blkbits); size_t mask = ~(cluster_size - 1); return (length + cluster_size - 1) & mask; } static int ext4_xattr_inode_alloc_quota(struct inode *inode, size_t len) { int err; err = dquot_alloc_inode(inode); if (err) return err; err = dquot_alloc_space_nodirty(inode, round_up_cluster(inode, len)); if (err) dquot_free_inode(inode); return err; } static void ext4_xattr_inode_free_quota(struct inode *parent, struct inode *ea_inode, size_t len) { if (ea_inode && ext4_test_inode_state(ea_inode, EXT4_STATE_LUSTRE_EA_INODE)) return; dquot_free_space_nodirty(parent, round_up_cluster(parent, len)); dquot_free_inode(parent); } int __ext4_xattr_set_credits(struct super_block *sb, struct inode *inode, struct buffer_head *block_bh, size_t value_len, bool is_create) { int credits; int blocks; /* * 1) Owner inode update * 2) Ref count update on old xattr block * 3) new xattr block * 4) block bitmap update for new xattr block * 5) group descriptor for new xattr block * 6) block bitmap update for old xattr block * 7) group descriptor for old block * * 6 & 7 can happen if we have two racing threads T_a and T_b * which are each trying to set an xattr on inodes 
I_a and I_b * which were both initially sharing an xattr block. */ credits = 7; /* Quota updates. */ credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(sb); /* * In case of inline data, we may push out the data to a block, * so we need to reserve credits for this eventuality */ if (inode && ext4_has_inline_data(inode)) credits += ext4_writepage_trans_blocks(inode) + 1; /* We are done if ea_inode feature is not enabled. */ if (!ext4_has_feature_ea_inode(sb)) return credits; /* New ea_inode, inode map, block bitmap, group descriptor. */ credits += 4; /* Data blocks. */ blocks = (value_len + sb->s_blocksize - 1) >> sb->s_blocksize_bits; /* Indirection block or one level of extent tree. */ blocks += 1; /* Block bitmap and group descriptor updates for each block. */ credits += blocks * 2; /* Blocks themselves. */ credits += blocks; if (!is_create) { /* Dereference ea_inode holding old xattr value. * Old ea_inode, inode map, block bitmap, group descriptor. */ credits += 4; /* Data blocks for old ea_inode. */ blocks = XATTR_SIZE_MAX >> sb->s_blocksize_bits; /* Indirection block or one level of extent tree for old * ea_inode. */ blocks += 1; /* Block bitmap and group descriptor updates for each block. */ credits += blocks * 2; } /* We may need to clone the existing xattr block in which case we need * to increment ref counts for existing ea_inodes referenced by it. */ if (block_bh) { struct ext4_xattr_entry *entry = BFIRST(block_bh); for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) if (entry->e_value_inum) /* Ref count update on ea_inode. */ credits += 1; } return credits; } static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode, int ref_change) { struct ext4_iloc iloc; s64 ref_count; int ret; inode_lock_nested(ea_inode, I_MUTEX_XATTR); ret = ext4_reserve_inode_write(handle, ea_inode, &iloc); if (ret) goto out; ref_count = ext4_xattr_inode_get_ref(ea_inode); ref_count += ref_change; ext4_xattr_inode_set_ref(ea_inode, ref_count); if (ref_change > 0) { WARN_ONCE(ref_count <= 0, "EA inode %lu ref_count=%lld", ea_inode->i_ino, ref_count); if (ref_count == 1) { WARN_ONCE(ea_inode->i_nlink, "EA inode %lu i_nlink=%u", ea_inode->i_ino, ea_inode->i_nlink); set_nlink(ea_inode, 1); ext4_orphan_del(handle, ea_inode); } } else { WARN_ONCE(ref_count < 0, "EA inode %lu ref_count=%lld", ea_inode->i_ino, ref_count); if (ref_count == 0) { WARN_ONCE(ea_inode->i_nlink != 1, "EA inode %lu i_nlink=%u", ea_inode->i_ino, ea_inode->i_nlink); clear_nlink(ea_inode); ext4_orphan_add(handle, ea_inode); } } ret = ext4_mark_iloc_dirty(handle, ea_inode, &iloc); if (ret) ext4_warning_inode(ea_inode, "ext4_mark_iloc_dirty() failed ret=%d", ret); out: inode_unlock(ea_inode); return ret; } static int ext4_xattr_inode_inc_ref(handle_t *handle, struct inode *ea_inode) { return ext4_xattr_inode_update_ref(handle, ea_inode, 1); } static int ext4_xattr_inode_dec_ref(handle_t *handle, struct inode *ea_inode) { return ext4_xattr_inode_update_ref(handle, ea_inode, -1); } static int ext4_xattr_inode_inc_ref_all(handle_t *handle, struct inode *parent, struct ext4_xattr_entry *first) { struct inode *ea_inode; struct ext4_xattr_entry *entry; struct ext4_xattr_entry *failed_entry; unsigned int ea_ino; int err, saved_err; for (entry = first; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) { if (!entry->e_value_inum) continue; ea_ino = le32_to_cpu(entry->e_value_inum); err = ext4_xattr_inode_iget(parent, ea_ino, le32_to_cpu(entry->e_hash), &ea_inode); if (err) goto cleanup; err = ext4_xattr_inode_inc_ref(handle, 
ea_inode); if (err) { ext4_warning_inode(ea_inode, "inc ref error %d", err); iput(ea_inode); goto cleanup; } iput(ea_inode); } return 0; cleanup: saved_err = err; failed_entry = entry; for (entry = first; entry != failed_entry; entry = EXT4_XATTR_NEXT(entry)) { if (!entry->e_value_inum) continue; ea_ino = le32_to_cpu(entry->e_value_inum); err = ext4_xattr_inode_iget(parent, ea_ino, le32_to_cpu(entry->e_hash), &ea_inode); if (err) { ext4_warning(parent->i_sb, "cleanup ea_ino %u iget error %d", ea_ino, err); continue; } err = ext4_xattr_inode_dec_ref(handle, ea_inode); if (err) ext4_warning_inode(ea_inode, "cleanup dec ref error %d", err); iput(ea_inode); } return saved_err; } static int ext4_xattr_restart_fn(handle_t *handle, struct inode *inode, struct buffer_head *bh, bool block_csum, bool dirty) { int error; if (bh && dirty) { if (block_csum) ext4_xattr_block_csum_set(inode, bh); error = ext4_handle_dirty_metadata(handle, NULL, bh); if (error) { ext4_warning(inode->i_sb, "Handle metadata (error %d)", error); return error; } } return 0; } static void ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent, struct buffer_head *bh, struct ext4_xattr_entry *first, bool block_csum, struct ext4_xattr_inode_array **ea_inode_array, int extra_credits, bool skip_quota) { struct inode *ea_inode; struct ext4_xattr_entry *entry; bool dirty = false; unsigned int ea_ino; int err; int credits; /* One credit for dec ref on ea_inode, one for orphan list addition, */ credits = 2 + extra_credits; for (entry = first; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) { if (!entry->e_value_inum) continue; ea_ino = le32_to_cpu(entry->e_value_inum); err = ext4_xattr_inode_iget(parent, ea_ino, le32_to_cpu(entry->e_hash), &ea_inode); if (err) continue; err = ext4_expand_inode_array(ea_inode_array, ea_inode); if (err) { ext4_warning_inode(ea_inode, "Expand inode array err=%d", err); iput(ea_inode); continue; } err = ext4_journal_ensure_credits_fn(handle, credits, credits, ext4_free_metadata_revoke_credits(parent->i_sb, 1), ext4_xattr_restart_fn(handle, parent, bh, block_csum, dirty)); if (err < 0) { ext4_warning_inode(ea_inode, "Ensure credits err=%d", err); continue; } if (err > 0) { err = ext4_journal_get_write_access(handle, parent->i_sb, bh, EXT4_JTR_NONE); if (err) { ext4_warning_inode(ea_inode, "Re-get write access err=%d", err); continue; } } err = ext4_xattr_inode_dec_ref(handle, ea_inode); if (err) { ext4_warning_inode(ea_inode, "ea_inode dec ref err=%d", err); continue; } if (!skip_quota) ext4_xattr_inode_free_quota(parent, ea_inode, le32_to_cpu(entry->e_value_size)); /* * Forget about ea_inode within the same transaction that * decrements the ref count. This avoids duplicate decrements in * case the rest of the work spills over to subsequent * transactions. */ entry->e_value_inum = 0; entry->e_value_size = 0; dirty = true; } if (dirty) { /* * Note that we are deliberately skipping csum calculation for * the final update because we do not expect any journal * restarts until xattr block is freed. */ err = ext4_handle_dirty_metadata(handle, NULL, bh); if (err) ext4_warning_inode(parent, "handle dirty metadata err=%d", err); } } /* * Release the xattr block BH: If the reference count is > 1, decrement it; * otherwise free the block. 
*/ static void ext4_xattr_release_block(handle_t *handle, struct inode *inode, struct buffer_head *bh, struct ext4_xattr_inode_array **ea_inode_array, int extra_credits) { struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode); u32 hash, ref; int error = 0; BUFFER_TRACE(bh, "get_write_access"); error = ext4_journal_get_write_access(handle, inode->i_sb, bh, EXT4_JTR_NONE); if (error) goto out; retry_ref: lock_buffer(bh); hash = le32_to_cpu(BHDR(bh)->h_hash); ref = le32_to_cpu(BHDR(bh)->h_refcount); if (ref == 1) { ea_bdebug(bh, "refcount now=0; freeing"); /* * This must happen under buffer lock for * ext4_xattr_block_set() to reliably detect freed block */ if (ea_block_cache) { struct mb_cache_entry *oe; oe = mb_cache_entry_delete_or_get(ea_block_cache, hash, bh->b_blocknr); if (oe) { unlock_buffer(bh); mb_cache_entry_wait_unused(oe); mb_cache_entry_put(ea_block_cache, oe); goto retry_ref; } } get_bh(bh); unlock_buffer(bh); if (ext4_has_feature_ea_inode(inode->i_sb)) ext4_xattr_inode_dec_ref_all(handle, inode, bh, BFIRST(bh), true /* block_csum */, ea_inode_array, extra_credits, true /* skip_quota */); ext4_free_blocks(handle, inode, bh, 0, 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); } else { ref--; BHDR(bh)->h_refcount = cpu_to_le32(ref); if (ref == EXT4_XATTR_REFCOUNT_MAX - 1) { struct mb_cache_entry *ce; if (ea_block_cache) { ce = mb_cache_entry_get(ea_block_cache, hash, bh->b_blocknr); if (ce) { set_bit(MBE_REUSABLE_B, &ce->e_flags); mb_cache_entry_put(ea_block_cache, ce); } } } ext4_xattr_block_csum_set(inode, bh); /* * Beware of this ugliness: Releasing of xattr block references * from different inodes can race and so we have to protect * from a race where someone else frees the block (and releases * its journal_head) before we are done dirtying the buffer. In * nojournal mode this race is harmless and we actually cannot * call ext4_handle_dirty_metadata() with locked buffer as * that function can call sync_dirty_buffer() so for that case * we handle the dirtying after unlocking the buffer. */ if (ext4_handle_valid(handle)) error = ext4_handle_dirty_metadata(handle, inode, bh); unlock_buffer(bh); if (!ext4_handle_valid(handle)) error = ext4_handle_dirty_metadata(handle, inode, bh); if (IS_SYNC(inode)) ext4_handle_sync(handle); dquot_free_block(inode, EXT4_C2B(EXT4_SB(inode->i_sb), 1)); ea_bdebug(bh, "refcount now=%d; releasing", le32_to_cpu(BHDR(bh)->h_refcount)); } out: ext4_std_error(inode->i_sb, error); return; } /* * Find the available free space for EAs. This also returns the total number of * bytes used by EA entries. */ static size_t ext4_xattr_free_space(struct ext4_xattr_entry *last, size_t *min_offs, void *base, int *total) { for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { if (!last->e_value_inum && last->e_value_size) { size_t offs = le16_to_cpu(last->e_value_offs); if (offs < *min_offs) *min_offs = offs; } if (total) *total += EXT4_XATTR_LEN(last->e_name_len); } return (*min_offs - ((void *)last - base) - sizeof(__u32)); } /* * Write the value of the EA in an inode. 
*/ static int ext4_xattr_inode_write(handle_t *handle, struct inode *ea_inode, const void *buf, int bufsize) { struct buffer_head *bh = NULL; unsigned long block = 0; int blocksize = ea_inode->i_sb->s_blocksize; int max_blocks = (bufsize + blocksize - 1) >> ea_inode->i_blkbits; int csize, wsize = 0; int ret = 0, ret2 = 0; int retries = 0; retry: while (ret >= 0 && ret < max_blocks) { struct ext4_map_blocks map; map.m_lblk = block += ret; map.m_len = max_blocks -= ret; ret = ext4_map_blocks(handle, ea_inode, &map, EXT4_GET_BLOCKS_CREATE); if (ret <= 0) { ext4_mark_inode_dirty(handle, ea_inode); if (ret == -ENOSPC && ext4_should_retry_alloc(ea_inode->i_sb, &retries)) { ret = 0; goto retry; } break; } } if (ret < 0) return ret; block = 0; while (wsize < bufsize) { brelse(bh); csize = (bufsize - wsize) > blocksize ? blocksize : bufsize - wsize; bh = ext4_getblk(handle, ea_inode, block, 0); if (IS_ERR(bh)) return PTR_ERR(bh); if (!bh) { WARN_ON_ONCE(1); EXT4_ERROR_INODE(ea_inode, "ext4_getblk() return bh = NULL"); return -EFSCORRUPTED; } ret = ext4_journal_get_write_access(handle, ea_inode->i_sb, bh, EXT4_JTR_NONE); if (ret) goto out; memcpy(bh->b_data, buf, csize); /* * Zero out block tail to avoid writing uninitialized memory * to disk. */ if (csize < blocksize) memset(bh->b_data + csize, 0, blocksize - csize); set_buffer_uptodate(bh); ext4_handle_dirty_metadata(handle, ea_inode, bh); buf += csize; wsize += csize; block += 1; } inode_lock(ea_inode); i_size_write(ea_inode, wsize); ext4_update_i_disksize(ea_inode, wsize); inode_unlock(ea_inode); ret2 = ext4_mark_inode_dirty(handle, ea_inode); if (unlikely(ret2 && !ret)) ret = ret2; out: brelse(bh); return ret; } /* * Create an inode to store the value of a large EA. */ static struct inode *ext4_xattr_inode_create(handle_t *handle, struct inode *inode, u32 hash) { struct inode *ea_inode = NULL; uid_t owner[2] = { i_uid_read(inode), i_gid_read(inode) }; int err; if (inode->i_sb->s_root == NULL) { ext4_warning(inode->i_sb, "refuse to create EA inode when umounting"); WARN_ON(1); return ERR_PTR(-EINVAL); } /* * Let the next inode be the goal, so we try and allocate the EA inode * in the same group, or nearby one. */ ea_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode, S_IFREG | 0600, NULL, inode->i_ino + 1, owner, EXT4_EA_INODE_FL); if (!IS_ERR(ea_inode)) { ea_inode->i_op = &ext4_file_inode_operations; ea_inode->i_fop = &ext4_file_operations; ext4_set_aops(ea_inode); ext4_xattr_inode_set_class(ea_inode); unlock_new_inode(ea_inode); ext4_xattr_inode_set_ref(ea_inode, 1); ext4_xattr_inode_set_hash(ea_inode, hash); err = ext4_mark_inode_dirty(handle, ea_inode); if (!err) err = ext4_inode_attach_jinode(ea_inode); if (err) { if (ext4_xattr_inode_dec_ref(handle, ea_inode)) ext4_warning_inode(ea_inode, "cleanup dec ref error %d", err); iput(ea_inode); return ERR_PTR(err); } /* * Xattr inodes are shared therefore quota charging is performed * at a higher level. 
*/ dquot_free_inode(ea_inode); dquot_drop(ea_inode); inode_lock(ea_inode); ea_inode->i_flags |= S_NOQUOTA; inode_unlock(ea_inode); } return ea_inode; } static struct inode * ext4_xattr_inode_cache_find(struct inode *inode, const void *value, size_t value_len, u32 hash) { struct inode *ea_inode; struct mb_cache_entry *ce; struct mb_cache *ea_inode_cache = EA_INODE_CACHE(inode); void *ea_data; if (!ea_inode_cache) return NULL; ce = mb_cache_entry_find_first(ea_inode_cache, hash); if (!ce) return NULL; WARN_ON_ONCE(ext4_handle_valid(journal_current_handle()) && !(current->flags & PF_MEMALLOC_NOFS)); ea_data = kvmalloc(value_len, GFP_KERNEL); if (!ea_data) { mb_cache_entry_put(ea_inode_cache, ce); return NULL; } while (ce) { ea_inode = ext4_iget(inode->i_sb, ce->e_value, EXT4_IGET_EA_INODE); if (IS_ERR(ea_inode)) goto next_entry; ext4_xattr_inode_set_class(ea_inode); if (i_size_read(ea_inode) == value_len && !ext4_xattr_inode_read(ea_inode, ea_data, value_len) && !ext4_xattr_inode_verify_hashes(ea_inode, NULL, ea_data, value_len) && !memcmp(value, ea_data, value_len)) { mb_cache_entry_touch(ea_inode_cache, ce); mb_cache_entry_put(ea_inode_cache, ce); kvfree(ea_data); return ea_inode; } iput(ea_inode); next_entry: ce = mb_cache_entry_find_next(ea_inode_cache, ce); } kvfree(ea_data); return NULL; } /* * Add value of the EA in an inode. */ static int ext4_xattr_inode_lookup_create(handle_t *handle, struct inode *inode, const void *value, size_t value_len, struct inode **ret_inode) { struct inode *ea_inode; u32 hash; int err; hash = ext4_xattr_inode_hash(EXT4_SB(inode->i_sb), value, value_len); ea_inode = ext4_xattr_inode_cache_find(inode, value, value_len, hash); if (ea_inode) { err = ext4_xattr_inode_inc_ref(handle, ea_inode); if (err) { iput(ea_inode); return err; } *ret_inode = ea_inode; return 0; } /* Create an inode for the EA value */ ea_inode = ext4_xattr_inode_create(handle, inode, hash); if (IS_ERR(ea_inode)) return PTR_ERR(ea_inode); err = ext4_xattr_inode_write(handle, ea_inode, value, value_len); if (err) { ext4_xattr_inode_dec_ref(handle, ea_inode); iput(ea_inode); return err; } if (EA_INODE_CACHE(inode)) mb_cache_entry_create(EA_INODE_CACHE(inode), GFP_NOFS, hash, ea_inode->i_ino, true /* reusable */); *ret_inode = ea_inode; return 0; } /* * Reserve min(block_size/8, 1024) bytes for xattr entries/names if ea_inode * feature is enabled. */ #define EXT4_XATTR_BLOCK_RESERVE(inode) min(i_blocksize(inode)/8, 1024U) static int ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s, handle_t *handle, struct inode *inode, bool is_block) { struct ext4_xattr_entry *last, *next; struct ext4_xattr_entry *here = s->here; size_t min_offs = s->end - s->base, name_len = strlen(i->name); int in_inode = i->in_inode; struct inode *old_ea_inode = NULL; struct inode *new_ea_inode = NULL; size_t old_size, new_size; int ret; /* Space used by old and new values. */ old_size = (!s->not_found && !here->e_value_inum) ? EXT4_XATTR_SIZE(le32_to_cpu(here->e_value_size)) : 0; new_size = (i->value && !in_inode) ? EXT4_XATTR_SIZE(i->value_len) : 0; /* * Optimization for the simple case when old and new values have the * same padded sizes. Not applicable if external inodes are involved. */ if (new_size && new_size == old_size) { size_t offs = le16_to_cpu(here->e_value_offs); void *val = s->base + offs; here->e_value_size = cpu_to_le32(i->value_len); if (i->value == EXT4_ZERO_XATTR_VALUE) { memset(val, 0, new_size); } else { memcpy(val, i->value, i->value_len); /* Clear padding bytes. 
*/ memset(val + i->value_len, 0, new_size - i->value_len); } goto update_hash; } /* Compute min_offs and last. */ last = s->first; for (; !IS_LAST_ENTRY(last); last = next) { next = EXT4_XATTR_NEXT(last); if ((void *)next >= s->end) { EXT4_ERROR_INODE(inode, "corrupted xattr entries"); ret = -EFSCORRUPTED; goto out; } if (!last->e_value_inum && last->e_value_size) { size_t offs = le16_to_cpu(last->e_value_offs); if (offs < min_offs) min_offs = offs; } } /* Check whether we have enough space. */ if (i->value) { size_t free; free = min_offs - ((void *)last - s->base) - sizeof(__u32); if (!s->not_found) free += EXT4_XATTR_LEN(name_len) + old_size; if (free < EXT4_XATTR_LEN(name_len) + new_size) { ret = -ENOSPC; goto out; } /* * If storing the value in an external inode is an option, * reserve space for xattr entries/names in the external * attribute block so that a long value does not occupy the * whole space and prevent further entries being added. */ if (ext4_has_feature_ea_inode(inode->i_sb) && new_size && is_block && (min_offs + old_size - new_size) < EXT4_XATTR_BLOCK_RESERVE(inode)) { ret = -ENOSPC; goto out; } } /* * Getting access to old and new ea inodes is subject to failures. * Finish that work before doing any modifications to the xattr data. */ if (!s->not_found && here->e_value_inum) { ret = ext4_xattr_inode_iget(inode, le32_to_cpu(here->e_value_inum), le32_to_cpu(here->e_hash), &old_ea_inode); if (ret) { old_ea_inode = NULL; goto out; } } if (i->value && in_inode) { WARN_ON_ONCE(!i->value_len); ret = ext4_xattr_inode_alloc_quota(inode, i->value_len); if (ret) goto out; ret = ext4_xattr_inode_lookup_create(handle, inode, i->value, i->value_len, &new_ea_inode); if (ret) { new_ea_inode = NULL; ext4_xattr_inode_free_quota(inode, NULL, i->value_len); goto out; } } if (old_ea_inode) { /* We are ready to release ref count on the old_ea_inode. */ ret = ext4_xattr_inode_dec_ref(handle, old_ea_inode); if (ret) { /* Release newly required ref count on new_ea_inode. */ if (new_ea_inode) { int err; err = ext4_xattr_inode_dec_ref(handle, new_ea_inode); if (err) ext4_warning_inode(new_ea_inode, "dec ref new_ea_inode err=%d", err); ext4_xattr_inode_free_quota(inode, new_ea_inode, i->value_len); } goto out; } ext4_xattr_inode_free_quota(inode, old_ea_inode, le32_to_cpu(here->e_value_size)); } /* No failures allowed past this point. */ if (!s->not_found && here->e_value_size && !here->e_value_inum) { /* Remove the old value. */ void *first_val = s->base + min_offs; size_t offs = le16_to_cpu(here->e_value_offs); void *val = s->base + offs; memmove(first_val + old_size, first_val, val - first_val); memset(first_val, 0, old_size); min_offs += old_size; /* Adjust all value offsets. */ last = s->first; while (!IS_LAST_ENTRY(last)) { size_t o = le16_to_cpu(last->e_value_offs); if (!last->e_value_inum && last->e_value_size && o < offs) last->e_value_offs = cpu_to_le16(o + old_size); last = EXT4_XATTR_NEXT(last); } } if (!i->value) { /* Remove old name. */ size_t size = EXT4_XATTR_LEN(name_len); last = ENTRY((void *)last - size); memmove(here, (void *)here + size, (void *)last - (void *)here + sizeof(__u32)); memset(last, 0, size); /* * Update i_inline_off - moved ibody region might contain * system.data attribute. Handling a failure here won't * cause other complications for setting an xattr. 
*/ if (!is_block && ext4_has_inline_data(inode)) { ret = ext4_find_inline_data_nolock(inode); if (ret) { ext4_warning_inode(inode, "unable to update i_inline_off"); goto out; } } } else if (s->not_found) { /* Insert new name. */ size_t size = EXT4_XATTR_LEN(name_len); size_t rest = (void *)last - (void *)here + sizeof(__u32); memmove((void *)here + size, here, rest); memset(here, 0, size); here->e_name_index = i->name_index; here->e_name_len = name_len; memcpy(here->e_name, i->name, name_len); } else { /* This is an update, reset value info. */ here->e_value_inum = 0; here->e_value_offs = 0; here->e_value_size = 0; } if (i->value) { /* Insert new value. */ if (in_inode) { here->e_value_inum = cpu_to_le32(new_ea_inode->i_ino); } else if (i->value_len) { void *val = s->base + min_offs - new_size; here->e_value_offs = cpu_to_le16(min_offs - new_size); if (i->value == EXT4_ZERO_XATTR_VALUE) { memset(val, 0, new_size); } else { memcpy(val, i->value, i->value_len); /* Clear padding bytes. */ memset(val + i->value_len, 0, new_size - i->value_len); } } here->e_value_size = cpu_to_le32(i->value_len); } update_hash: if (i->value) { __le32 hash = 0; /* Entry hash calculation. */ if (in_inode) { __le32 crc32c_hash; /* * Feed crc32c hash instead of the raw value for entry * hash calculation. This is to avoid walking * potentially long value buffer again. */ crc32c_hash = cpu_to_le32( ext4_xattr_inode_get_hash(new_ea_inode)); hash = ext4_xattr_hash_entry(here->e_name, here->e_name_len, &crc32c_hash, 1); } else if (is_block) { __le32 *value = s->base + le16_to_cpu( here->e_value_offs); hash = ext4_xattr_hash_entry(here->e_name, here->e_name_len, value, new_size >> 2); } here->e_hash = hash; } if (is_block) ext4_xattr_rehash((struct ext4_xattr_header *)s->base); ret = 0; out: iput(old_ea_inode); iput(new_ea_inode); return ret; } struct ext4_xattr_block_find { struct ext4_xattr_search s; struct buffer_head *bh; }; static int ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i, struct ext4_xattr_block_find *bs) { struct super_block *sb = inode->i_sb; int error; ea_idebug(inode, "name=%d.%s, value=%p, value_len=%ld", i->name_index, i->name, i->value, (long)i->value_len); if (EXT4_I(inode)->i_file_acl) { /* The inode already has an extended attribute block. */ bs->bh = ext4_sb_bread(sb, EXT4_I(inode)->i_file_acl, REQ_PRIO); if (IS_ERR(bs->bh)) { error = PTR_ERR(bs->bh); bs->bh = NULL; return error; } ea_bdebug(bs->bh, "b_count=%d, refcount=%d", atomic_read(&(bs->bh->b_count)), le32_to_cpu(BHDR(bs->bh)->h_refcount)); error = ext4_xattr_check_block(inode, bs->bh); if (error) return error; /* Find the named attribute. 
*/ bs->s.base = BHDR(bs->bh); bs->s.first = BFIRST(bs->bh); bs->s.end = bs->bh->b_data + bs->bh->b_size; bs->s.here = bs->s.first; error = xattr_find_entry(inode, &bs->s.here, bs->s.end, i->name_index, i->name, 1); if (error && error != -ENODATA) return error; bs->s.not_found = error; } return 0; } static int ext4_xattr_block_set(handle_t *handle, struct inode *inode, struct ext4_xattr_info *i, struct ext4_xattr_block_find *bs) { struct super_block *sb = inode->i_sb; struct buffer_head *new_bh = NULL; struct ext4_xattr_search s_copy = bs->s; struct ext4_xattr_search *s = &s_copy; struct mb_cache_entry *ce = NULL; int error = 0; struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode); struct inode *ea_inode = NULL, *tmp_inode; size_t old_ea_inode_quota = 0; unsigned int ea_ino; #define header(x) ((struct ext4_xattr_header *)(x)) if (s->base) { int offset = (char *)s->here - bs->bh->b_data; BUFFER_TRACE(bs->bh, "get_write_access"); error = ext4_journal_get_write_access(handle, sb, bs->bh, EXT4_JTR_NONE); if (error) goto cleanup; lock_buffer(bs->bh); if (header(s->base)->h_refcount == cpu_to_le32(1)) { __u32 hash = le32_to_cpu(BHDR(bs->bh)->h_hash); /* * This must happen under buffer lock for * ext4_xattr_block_set() to reliably detect modified * block */ if (ea_block_cache) { struct mb_cache_entry *oe; oe = mb_cache_entry_delete_or_get(ea_block_cache, hash, bs->bh->b_blocknr); if (oe) { /* * Xattr block is getting reused. Leave * it alone. */ mb_cache_entry_put(ea_block_cache, oe); goto clone_block; } } ea_bdebug(bs->bh, "modifying in-place"); error = ext4_xattr_set_entry(i, s, handle, inode, true /* is_block */); ext4_xattr_block_csum_set(inode, bs->bh); unlock_buffer(bs->bh); if (error == -EFSCORRUPTED) goto bad_block; if (!error) error = ext4_handle_dirty_metadata(handle, inode, bs->bh); if (error) goto cleanup; goto inserted; } clone_block: unlock_buffer(bs->bh); ea_bdebug(bs->bh, "cloning"); s->base = kmemdup(BHDR(bs->bh), bs->bh->b_size, GFP_NOFS); error = -ENOMEM; if (s->base == NULL) goto cleanup; s->first = ENTRY(header(s->base)+1); header(s->base)->h_refcount = cpu_to_le32(1); s->here = ENTRY(s->base + offset); s->end = s->base + bs->bh->b_size; /* * If existing entry points to an xattr inode, we need * to prevent ext4_xattr_set_entry() from decrementing * ref count on it because the reference belongs to the * original block. In this case, make the entry look * like it has an empty value. */ if (!s->not_found && s->here->e_value_inum) { ea_ino = le32_to_cpu(s->here->e_value_inum); error = ext4_xattr_inode_iget(inode, ea_ino, le32_to_cpu(s->here->e_hash), &tmp_inode); if (error) goto cleanup; if (!ext4_test_inode_state(tmp_inode, EXT4_STATE_LUSTRE_EA_INODE)) { /* * Defer quota free call for previous * inode until success is guaranteed. */ old_ea_inode_quota = le32_to_cpu( s->here->e_value_size); } iput(tmp_inode); s->here->e_value_inum = 0; s->here->e_value_size = 0; } } else { /* Allocate a buffer where we construct the new block. 
*/ s->base = kzalloc(sb->s_blocksize, GFP_NOFS); error = -ENOMEM; if (s->base == NULL) goto cleanup; header(s->base)->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC); header(s->base)->h_blocks = cpu_to_le32(1); header(s->base)->h_refcount = cpu_to_le32(1); s->first = ENTRY(header(s->base)+1); s->here = ENTRY(header(s->base)+1); s->end = s->base + sb->s_blocksize; } error = ext4_xattr_set_entry(i, s, handle, inode, true /* is_block */); if (error == -EFSCORRUPTED) goto bad_block; if (error) goto cleanup; if (i->value && s->here->e_value_inum) { /* * A ref count on ea_inode has been taken as part of the call to * ext4_xattr_set_entry() above. We would like to drop this * extra ref but we have to wait until the xattr block is * initialized and has its own ref count on the ea_inode. */ ea_ino = le32_to_cpu(s->here->e_value_inum); error = ext4_xattr_inode_iget(inode, ea_ino, le32_to_cpu(s->here->e_hash), &ea_inode); if (error) { ea_inode = NULL; goto cleanup; } } inserted: if (!IS_LAST_ENTRY(s->first)) { new_bh = ext4_xattr_block_cache_find(inode, header(s->base), &ce); if (new_bh) { /* We found an identical block in the cache. */ if (new_bh == bs->bh) ea_bdebug(new_bh, "keeping"); else { u32 ref; #ifdef EXT4_XATTR_DEBUG WARN_ON_ONCE(dquot_initialize_needed(inode)); #endif /* The old block is released after updating the inode. */ error = dquot_alloc_block(inode, EXT4_C2B(EXT4_SB(sb), 1)); if (error) goto cleanup; BUFFER_TRACE(new_bh, "get_write_access"); error = ext4_journal_get_write_access( handle, sb, new_bh, EXT4_JTR_NONE); if (error) goto cleanup_dquot; lock_buffer(new_bh); /* * We have to be careful about races with * adding references to xattr block. Once we * hold buffer lock xattr block's state is * stable so we can check the additional * reference fits. */ ref = le32_to_cpu(BHDR(new_bh)->h_refcount) + 1; if (ref > EXT4_XATTR_REFCOUNT_MAX) { /* * Undo everything and check mbcache * again. */ unlock_buffer(new_bh); dquot_free_block(inode, EXT4_C2B(EXT4_SB(sb), 1)); brelse(new_bh); mb_cache_entry_put(ea_block_cache, ce); ce = NULL; new_bh = NULL; goto inserted; } BHDR(new_bh)->h_refcount = cpu_to_le32(ref); if (ref == EXT4_XATTR_REFCOUNT_MAX) clear_bit(MBE_REUSABLE_B, &ce->e_flags); ea_bdebug(new_bh, "reusing; refcount now=%d", ref); ext4_xattr_block_csum_set(inode, new_bh); unlock_buffer(new_bh); error = ext4_handle_dirty_metadata(handle, inode, new_bh); if (error) goto cleanup_dquot; } mb_cache_entry_touch(ea_block_cache, ce); mb_cache_entry_put(ea_block_cache, ce); ce = NULL; } else if (bs->bh && s->base == bs->bh->b_data) { /* We were modifying this block in-place. */ ea_bdebug(bs->bh, "keeping this block"); ext4_xattr_block_cache_insert(ea_block_cache, bs->bh); new_bh = bs->bh; get_bh(new_bh); } else { /* We need to allocate a new block */ ext4_fsblk_t goal, block; #ifdef EXT4_XATTR_DEBUG WARN_ON_ONCE(dquot_initialize_needed(inode)); #endif goal = ext4_group_first_block_no(sb, EXT4_I(inode)->i_block_group); block = ext4_new_meta_blocks(handle, inode, goal, 0, NULL, &error); if (error) goto cleanup; ea_idebug(inode, "creating block %llu", (unsigned long long)block); new_bh = sb_getblk(sb, block); if (unlikely(!new_bh)) { error = -ENOMEM; getblk_failed: ext4_free_blocks(handle, inode, NULL, block, 1, EXT4_FREE_BLOCKS_METADATA); goto cleanup; } error = ext4_xattr_inode_inc_ref_all(handle, inode, ENTRY(header(s->base)+1)); if (error) goto getblk_failed; if (ea_inode) { /* Drop the extra ref on ea_inode. 
*/ error = ext4_xattr_inode_dec_ref(handle, ea_inode); if (error) ext4_warning_inode(ea_inode, "dec ref error=%d", error); iput(ea_inode); ea_inode = NULL; } lock_buffer(new_bh); error = ext4_journal_get_create_access(handle, sb, new_bh, EXT4_JTR_NONE); if (error) { unlock_buffer(new_bh); error = -EIO; goto getblk_failed; } memcpy(new_bh->b_data, s->base, new_bh->b_size); ext4_xattr_block_csum_set(inode, new_bh); set_buffer_uptodate(new_bh); unlock_buffer(new_bh); ext4_xattr_block_cache_insert(ea_block_cache, new_bh); error = ext4_handle_dirty_metadata(handle, inode, new_bh); if (error) goto cleanup; } } if (old_ea_inode_quota) ext4_xattr_inode_free_quota(inode, NULL, old_ea_inode_quota); /* Update the inode. */ EXT4_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0; /* Drop the previous xattr block. */ if (bs->bh && bs->bh != new_bh) { struct ext4_xattr_inode_array *ea_inode_array = NULL; ext4_xattr_release_block(handle, inode, bs->bh, &ea_inode_array, 0 /* extra_credits */); ext4_xattr_inode_array_free(ea_inode_array); } error = 0; cleanup: if (ea_inode) { int error2; error2 = ext4_xattr_inode_dec_ref(handle, ea_inode); if (error2) ext4_warning_inode(ea_inode, "dec ref error=%d", error2); /* If there was an error, revert the quota charge. */ if (error) ext4_xattr_inode_free_quota(inode, ea_inode, i_size_read(ea_inode)); iput(ea_inode); } if (ce) mb_cache_entry_put(ea_block_cache, ce); brelse(new_bh); if (!(bs->bh && s->base == bs->bh->b_data)) kfree(s->base); return error; cleanup_dquot: dquot_free_block(inode, EXT4_C2B(EXT4_SB(sb), 1)); goto cleanup; bad_block: EXT4_ERROR_INODE(inode, "bad block %llu", EXT4_I(inode)->i_file_acl); goto cleanup; #undef header } int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i, struct ext4_xattr_ibody_find *is) { struct ext4_xattr_ibody_header *header; struct ext4_inode *raw_inode; int error; if (!EXT4_INODE_HAS_XATTR_SPACE(inode)) return 0; raw_inode = ext4_raw_inode(&is->iloc); header = IHDR(inode, raw_inode); is->s.base = is->s.first = IFIRST(header); is->s.here = is->s.first; is->s.end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) { error = xattr_check_inode(inode, header, is->s.end); if (error) return error; /* Find the named attribute. */ error = xattr_find_entry(inode, &is->s.here, is->s.end, i->name_index, i->name, 0); if (error && error != -ENODATA) return error; is->s.not_found = error; } return 0; } int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, struct ext4_xattr_info *i, struct ext4_xattr_ibody_find *is) { struct ext4_xattr_ibody_header *header; struct ext4_xattr_search *s = &is->s; int error; if (!EXT4_INODE_HAS_XATTR_SPACE(inode)) return -ENOSPC; error = ext4_xattr_set_entry(i, s, handle, inode, false /* is_block */); if (error) return error; header = IHDR(inode, ext4_raw_inode(&is->iloc)); if (!IS_LAST_ENTRY(s->first)) { header->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC); ext4_set_inode_state(inode, EXT4_STATE_XATTR); } else { header->h_magic = cpu_to_le32(0); ext4_clear_inode_state(inode, EXT4_STATE_XATTR); } return 0; } static int ext4_xattr_value_same(struct ext4_xattr_search *s, struct ext4_xattr_info *i) { void *value; /* When e_value_inum is set the value is stored externally. 
*/ if (s->here->e_value_inum) return 0; if (le32_to_cpu(s->here->e_value_size) != i->value_len) return 0; value = ((void *)s->base) + le16_to_cpu(s->here->e_value_offs); return !memcmp(value, i->value, i->value_len); } static struct buffer_head *ext4_xattr_get_block(struct inode *inode) { struct buffer_head *bh; int error; if (!EXT4_I(inode)->i_file_acl) return NULL; bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO); if (IS_ERR(bh)) return bh; error = ext4_xattr_check_block(inode, bh); if (error) { brelse(bh); return ERR_PTR(error); } return bh; } /* * ext4_xattr_set_handle() * * Create, replace or remove an extended attribute for this inode. Value * is NULL to remove an existing extended attribute, and non-NULL to * either replace an existing extended attribute, or create a new extended * attribute. The flags XATTR_REPLACE and XATTR_CREATE * specify that an extended attribute must exist and must not exist * previous to the call, respectively. * * Returns 0, or a negative error number on failure. */ int ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, const char *name, const void *value, size_t value_len, int flags) { struct ext4_xattr_info i = { .name_index = name_index, .name = name, .value = value, .value_len = value_len, .in_inode = 0, }; struct ext4_xattr_ibody_find is = { .s = { .not_found = -ENODATA, }, }; struct ext4_xattr_block_find bs = { .s = { .not_found = -ENODATA, }, }; int no_expand; int error; if (!name) return -EINVAL; if (strlen(name) > 255) return -ERANGE; ext4_write_lock_xattr(inode, &no_expand); /* Check journal credits under write lock. */ if (ext4_handle_valid(handle)) { struct buffer_head *bh; int credits; bh = ext4_xattr_get_block(inode); if (IS_ERR(bh)) { error = PTR_ERR(bh); goto cleanup; } credits = __ext4_xattr_set_credits(inode->i_sb, inode, bh, value_len, flags & XATTR_CREATE); brelse(bh); if (jbd2_handle_buffer_credits(handle) < credits) { error = -ENOSPC; goto cleanup; } WARN_ON_ONCE(!(current->flags & PF_MEMALLOC_NOFS)); } error = ext4_reserve_inode_write(handle, inode, &is.iloc); if (error) goto cleanup; if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) { struct ext4_inode *raw_inode = ext4_raw_inode(&is.iloc); memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); ext4_clear_inode_state(inode, EXT4_STATE_NEW); } error = ext4_xattr_ibody_find(inode, &i, &is); if (error) goto cleanup; if (is.s.not_found) error = ext4_xattr_block_find(inode, &i, &bs); if (error) goto cleanup; if (is.s.not_found && bs.s.not_found) { error = -ENODATA; if (flags & XATTR_REPLACE) goto cleanup; error = 0; if (!value) goto cleanup; } else { error = -EEXIST; if (flags & XATTR_CREATE) goto cleanup; } if (!value) { if (!is.s.not_found) error = ext4_xattr_ibody_set(handle, inode, &i, &is); else if (!bs.s.not_found) error = ext4_xattr_block_set(handle, inode, &i, &bs); } else { error = 0; /* Xattr value did not change? 
Save us some work and bail out */ if (!is.s.not_found && ext4_xattr_value_same(&is.s, &i)) goto cleanup; if (!bs.s.not_found && ext4_xattr_value_same(&bs.s, &i)) goto cleanup; if (ext4_has_feature_ea_inode(inode->i_sb) && (EXT4_XATTR_SIZE(i.value_len) > EXT4_XATTR_MIN_LARGE_EA_SIZE(inode->i_sb->s_blocksize))) i.in_inode = 1; retry_inode: error = ext4_xattr_ibody_set(handle, inode, &i, &is); if (!error && !bs.s.not_found) { i.value = NULL; error = ext4_xattr_block_set(handle, inode, &i, &bs); } else if (error == -ENOSPC) { if (EXT4_I(inode)->i_file_acl && !bs.s.base) { brelse(bs.bh); bs.bh = NULL; error = ext4_xattr_block_find(inode, &i, &bs); if (error) goto cleanup; } error = ext4_xattr_block_set(handle, inode, &i, &bs); if (!error && !is.s.not_found) { i.value = NULL; error = ext4_xattr_ibody_set(handle, inode, &i, &is); } else if (error == -ENOSPC) { /* * Xattr does not fit in the block, store at * external inode if possible. */ if (ext4_has_feature_ea_inode(inode->i_sb) && i.value_len && !i.in_inode) { i.in_inode = 1; goto retry_inode; } } } } if (!error) { ext4_xattr_update_super_block(handle, inode->i_sb); inode->i_ctime = current_time(inode); if (!value) no_expand = 0; error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); /* * The bh is consumed by ext4_mark_iloc_dirty, even with * error != 0. */ is.iloc.bh = NULL; if (IS_SYNC(inode)) ext4_handle_sync(handle); } ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR, handle); cleanup: brelse(is.iloc.bh); brelse(bs.bh); ext4_write_unlock_xattr(inode, &no_expand); return error; } int ext4_xattr_set_credits(struct inode *inode, size_t value_len, bool is_create, int *credits) { struct buffer_head *bh; int err; *credits = 0; if (!EXT4_SB(inode->i_sb)->s_journal) return 0; down_read(&EXT4_I(inode)->xattr_sem); bh = ext4_xattr_get_block(inode); if (IS_ERR(bh)) { err = PTR_ERR(bh); } else { *credits = __ext4_xattr_set_credits(inode->i_sb, inode, bh, value_len, is_create); brelse(bh); err = 0; } up_read(&EXT4_I(inode)->xattr_sem); return err; } /* * ext4_xattr_set() * * Like ext4_xattr_set_handle, but start from an inode. This extended * attribute modification is a filesystem transaction by itself. * * Returns 0, or a negative error number on failure. */ int ext4_xattr_set(struct inode *inode, int name_index, const char *name, const void *value, size_t value_len, int flags) { handle_t *handle; struct super_block *sb = inode->i_sb; int error, retries = 0; int credits; error = dquot_initialize(inode); if (error) return error; retry: error = ext4_xattr_set_credits(inode, value_len, flags & XATTR_CREATE, &credits); if (error) return error; handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits); if (IS_ERR(handle)) { error = PTR_ERR(handle); } else { int error2; error = ext4_xattr_set_handle(handle, inode, name_index, name, value, value_len, flags); ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR, handle); error2 = ext4_journal_stop(handle); if (error == -ENOSPC && ext4_should_retry_alloc(sb, &retries)) goto retry; if (error == 0) error = error2; } return error; } /* * Shift the EA entries in the inode to create space for the increased * i_extra_isize. 
*/ static void ext4_xattr_shift_entries(struct ext4_xattr_entry *entry, int value_offs_shift, void *to, void *from, size_t n) { struct ext4_xattr_entry *last = entry; int new_offs; /* We always shift xattr headers further thus offsets get lower */ BUG_ON(value_offs_shift > 0); /* Adjust the value offsets of the entries */ for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { if (!last->e_value_inum && last->e_value_size) { new_offs = le16_to_cpu(last->e_value_offs) + value_offs_shift; last->e_value_offs = cpu_to_le16(new_offs); } } /* Shift the entries by n bytes */ memmove(to, from, n); } /* * Move xattr pointed to by 'entry' from inode into external xattr block */ static int ext4_xattr_move_to_block(handle_t *handle, struct inode *inode, struct ext4_inode *raw_inode, struct ext4_xattr_entry *entry) { struct ext4_xattr_ibody_find *is = NULL; struct ext4_xattr_block_find *bs = NULL; char *buffer = NULL, *b_entry_name = NULL; size_t value_size = le32_to_cpu(entry->e_value_size); struct ext4_xattr_info i = { .value = NULL, .value_len = 0, .name_index = entry->e_name_index, .in_inode = !!entry->e_value_inum, }; struct ext4_xattr_ibody_header *header = IHDR(inode, raw_inode); int needs_kvfree = 0; int error; is = kzalloc(sizeof(struct ext4_xattr_ibody_find), GFP_NOFS); bs = kzalloc(sizeof(struct ext4_xattr_block_find), GFP_NOFS); b_entry_name = kmalloc(entry->e_name_len + 1, GFP_NOFS); if (!is || !bs || !b_entry_name) { error = -ENOMEM; goto out; } is->s.not_found = -ENODATA; bs->s.not_found = -ENODATA; is->iloc.bh = NULL; bs->bh = NULL; /* Save the entry name and the entry value */ if (entry->e_value_inum) { buffer = kvmalloc(value_size, GFP_NOFS); if (!buffer) { error = -ENOMEM; goto out; } needs_kvfree = 1; error = ext4_xattr_inode_get(inode, entry, buffer, value_size); if (error) goto out; } else { size_t value_offs = le16_to_cpu(entry->e_value_offs); buffer = (void *)IFIRST(header) + value_offs; } memcpy(b_entry_name, entry->e_name, entry->e_name_len); b_entry_name[entry->e_name_len] = '\0'; i.name = b_entry_name; error = ext4_get_inode_loc(inode, &is->iloc); if (error) goto out; error = ext4_xattr_ibody_find(inode, &i, is); if (error) goto out; i.value = buffer; i.value_len = value_size; error = ext4_xattr_block_find(inode, &i, bs); if (error) goto out; /* Move ea entry from the inode into the block */ error = ext4_xattr_block_set(handle, inode, &i, bs); if (error) goto out; /* Remove the chosen entry from the inode */ i.value = NULL; i.value_len = 0; error = ext4_xattr_ibody_set(handle, inode, &i, is); out: kfree(b_entry_name); if (needs_kvfree && buffer) kvfree(buffer); if (is) brelse(is->iloc.bh); if (bs) brelse(bs->bh); kfree(is); kfree(bs); return error; } static int ext4_xattr_make_inode_space(handle_t *handle, struct inode *inode, struct ext4_inode *raw_inode, int isize_diff, size_t ifree, size_t bfree, int *total_ino) { struct ext4_xattr_ibody_header *header = IHDR(inode, raw_inode); struct ext4_xattr_entry *small_entry; struct ext4_xattr_entry *entry; struct ext4_xattr_entry *last; unsigned int entry_size; /* EA entry size */ unsigned int total_size; /* EA entry size + value size */ unsigned int min_total_size; int error; while (isize_diff > ifree) { entry = NULL; small_entry = NULL; min_total_size = ~0U; last = IFIRST(header); /* Find the entry best suited to be pushed into EA block */ for (; !IS_LAST_ENTRY(last); last = EXT4_XATTR_NEXT(last)) { /* never move system.data out of the inode */ if ((last->e_name_len == 4) && (last->e_name_index == EXT4_XATTR_INDEX_SYSTEM) && 
!memcmp(last->e_name, "data", 4)) continue; total_size = EXT4_XATTR_LEN(last->e_name_len); if (!last->e_value_inum) total_size += EXT4_XATTR_SIZE( le32_to_cpu(last->e_value_size)); if (total_size <= bfree && total_size < min_total_size) { if (total_size + ifree < isize_diff) { small_entry = last; } else { entry = last; min_total_size = total_size; } } } if (entry == NULL) { if (small_entry == NULL) return -ENOSPC; entry = small_entry; } entry_size = EXT4_XATTR_LEN(entry->e_name_len); total_size = entry_size; if (!entry->e_value_inum) total_size += EXT4_XATTR_SIZE( le32_to_cpu(entry->e_value_size)); error = ext4_xattr_move_to_block(handle, inode, raw_inode, entry); if (error) return error; *total_ino -= entry_size; ifree += total_size; bfree -= total_size; } return 0; } /* * Expand an inode by new_extra_isize bytes when EAs are present. * Returns 0 on success or negative error number on failure. */ int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, struct ext4_inode *raw_inode, handle_t *handle) { struct ext4_xattr_ibody_header *header; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); static unsigned int mnt_count; size_t min_offs; size_t ifree, bfree; int total_ino; void *base, *end; int error = 0, tried_min_extra_isize = 0; int s_min_extra_isize = le16_to_cpu(sbi->s_es->s_min_extra_isize); int isize_diff; /* How much do we need to grow i_extra_isize */ retry: isize_diff = new_extra_isize - EXT4_I(inode)->i_extra_isize; if (EXT4_I(inode)->i_extra_isize >= new_extra_isize) return 0; header = IHDR(inode, raw_inode); /* * Check if enough free space is available in the inode to shift the * entries ahead by new_extra_isize. */ base = IFIRST(header); end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; min_offs = end - base; total_ino = sizeof(struct ext4_xattr_ibody_header) + sizeof(u32); error = xattr_check_inode(inode, header, end); if (error) goto cleanup; ifree = ext4_xattr_free_space(base, &min_offs, base, &total_ino); if (ifree >= isize_diff) goto shift; /* * Enough free space isn't available in the inode, check if * EA block can hold new_extra_isize bytes. 
*/ if (EXT4_I(inode)->i_file_acl) { struct buffer_head *bh; bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO); if (IS_ERR(bh)) { error = PTR_ERR(bh); goto cleanup; } error = ext4_xattr_check_block(inode, bh); if (error) { brelse(bh); goto cleanup; } base = BHDR(bh); end = bh->b_data + bh->b_size; min_offs = end - base; bfree = ext4_xattr_free_space(BFIRST(bh), &min_offs, base, NULL); brelse(bh); if (bfree + ifree < isize_diff) { if (!tried_min_extra_isize && s_min_extra_isize) { tried_min_extra_isize++; new_extra_isize = s_min_extra_isize; goto retry; } error = -ENOSPC; goto cleanup; } } else { bfree = inode->i_sb->s_blocksize; } error = ext4_xattr_make_inode_space(handle, inode, raw_inode, isize_diff, ifree, bfree, &total_ino); if (error) { if (error == -ENOSPC && !tried_min_extra_isize && s_min_extra_isize) { tried_min_extra_isize++; new_extra_isize = s_min_extra_isize; goto retry; } goto cleanup; } shift: /* Adjust the offsets and shift the remaining entries ahead */ ext4_xattr_shift_entries(IFIRST(header), EXT4_I(inode)->i_extra_isize - new_extra_isize, (void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE + new_extra_isize, (void *)header, total_ino); EXT4_I(inode)->i_extra_isize = new_extra_isize; if (ext4_has_inline_data(inode)) error = ext4_find_inline_data_nolock(inode); cleanup: if (error && (mnt_count != le16_to_cpu(sbi->s_es->s_mnt_count))) { ext4_warning(inode->i_sb, "Unable to expand inode %lu. Delete some EAs or run e2fsck.", inode->i_ino); mnt_count = le16_to_cpu(sbi->s_es->s_mnt_count); } return error; } #define EIA_INCR 16 /* must be 2^n */ #define EIA_MASK (EIA_INCR - 1) /* Add the large xattr @inode into @ea_inode_array for deferred iput(). * If @ea_inode_array is new or full it will be grown and the old * contents copied over. */ static int ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array, struct inode *inode) { if (*ea_inode_array == NULL) { /* * Start with 15 inodes, so it fits into a power-of-two size. * If *ea_inode_array is NULL, this is essentially offsetof() */ (*ea_inode_array) = kmalloc(offsetof(struct ext4_xattr_inode_array, inodes[EIA_MASK]), GFP_NOFS); if (*ea_inode_array == NULL) return -ENOMEM; (*ea_inode_array)->count = 0; } else if (((*ea_inode_array)->count & EIA_MASK) == EIA_MASK) { /* expand the array once all 15 + n * 16 slots are full */ struct ext4_xattr_inode_array *new_array = NULL; int count = (*ea_inode_array)->count; /* if new_array is NULL, this is essentially offsetof() */ new_array = kmalloc( offsetof(struct ext4_xattr_inode_array, inodes[count + EIA_INCR]), GFP_NOFS); if (new_array == NULL) return -ENOMEM; memcpy(new_array, *ea_inode_array, offsetof(struct ext4_xattr_inode_array, inodes[count])); kfree(*ea_inode_array); *ea_inode_array = new_array; } (*ea_inode_array)->inodes[(*ea_inode_array)->count++] = inode; return 0; } /* * ext4_xattr_delete_inode() * * Free extended attribute resources associated with this inode. Traverse * all entries and decrement reference on any xattr inodes associated with this * inode. This is called immediately before an inode is freed. We have exclusive * access to the inode. If an orphan inode is deleted it will also release its * references on xattr block and xattr inodes. 
*/ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, struct ext4_xattr_inode_array **ea_inode_array, int extra_credits) { struct buffer_head *bh = NULL; struct ext4_xattr_ibody_header *header; struct ext4_iloc iloc = { .bh = NULL }; struct ext4_xattr_entry *entry; struct inode *ea_inode; int error; error = ext4_journal_ensure_credits(handle, extra_credits, ext4_free_metadata_revoke_credits(inode->i_sb, 1)); if (error < 0) { EXT4_ERROR_INODE(inode, "ensure credits (error %d)", error); goto cleanup; } if (ext4_has_feature_ea_inode(inode->i_sb) && ext4_test_inode_state(inode, EXT4_STATE_XATTR)) { error = ext4_get_inode_loc(inode, &iloc); if (error) { EXT4_ERROR_INODE(inode, "inode loc (error %d)", error); goto cleanup; } error = ext4_journal_get_write_access(handle, inode->i_sb, iloc.bh, EXT4_JTR_NONE); if (error) { EXT4_ERROR_INODE(inode, "write access (error %d)", error); goto cleanup; } header = IHDR(inode, ext4_raw_inode(&iloc)); if (header->h_magic == cpu_to_le32(EXT4_XATTR_MAGIC)) ext4_xattr_inode_dec_ref_all(handle, inode, iloc.bh, IFIRST(header), false /* block_csum */, ea_inode_array, extra_credits, false /* skip_quota */); } if (EXT4_I(inode)->i_file_acl) { bh = ext4_sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl, REQ_PRIO); if (IS_ERR(bh)) { error = PTR_ERR(bh); if (error == -EIO) { EXT4_ERROR_INODE_ERR(inode, EIO, "block %llu read error", EXT4_I(inode)->i_file_acl); } bh = NULL; goto cleanup; } error = ext4_xattr_check_block(inode, bh); if (error) goto cleanup; if (ext4_has_feature_ea_inode(inode->i_sb)) { for (entry = BFIRST(bh); !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) { if (!entry->e_value_inum) continue; error = ext4_xattr_inode_iget(inode, le32_to_cpu(entry->e_value_inum), le32_to_cpu(entry->e_hash), &ea_inode); if (error) continue; ext4_xattr_inode_free_quota(inode, ea_inode, le32_to_cpu(entry->e_value_size)); iput(ea_inode); } } ext4_xattr_release_block(handle, inode, bh, ea_inode_array, extra_credits); /* * Update i_file_acl value in the same transaction that releases * block. */ EXT4_I(inode)->i_file_acl = 0; error = ext4_mark_inode_dirty(handle, inode); if (error) { EXT4_ERROR_INODE(inode, "mark inode dirty (error %d)", error); goto cleanup; } ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_XATTR, handle); } error = 0; cleanup: brelse(iloc.bh); brelse(bh); return error; } void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *ea_inode_array) { int idx; if (ea_inode_array == NULL) return; for (idx = 0; idx < ea_inode_array->count; ++idx) iput(ea_inode_array->inodes[idx]); kfree(ea_inode_array); } /* * ext4_xattr_block_cache_insert() * * Create a new entry in the extended attribute block cache, and insert * it unless such an entry is already in the cache. * * Returns 0, or a negative error number on failure. */ static void ext4_xattr_block_cache_insert(struct mb_cache *ea_block_cache, struct buffer_head *bh) { struct ext4_xattr_header *header = BHDR(bh); __u32 hash = le32_to_cpu(header->h_hash); int reusable = le32_to_cpu(header->h_refcount) < EXT4_XATTR_REFCOUNT_MAX; int error; if (!ea_block_cache) return; error = mb_cache_entry_create(ea_block_cache, GFP_NOFS, hash, bh->b_blocknr, reusable); if (error) { if (error == -EBUSY) ea_bdebug(bh, "already in cache"); } else ea_bdebug(bh, "inserting [%x]", (int)hash); } /* * ext4_xattr_cmp() * * Compare two extended attribute blocks for equality. * * Returns 0 if the blocks are equal, 1 if they differ, and * a negative error number on errors. 
*/ static int ext4_xattr_cmp(struct ext4_xattr_header *header1, struct ext4_xattr_header *header2) { struct ext4_xattr_entry *entry1, *entry2; entry1 = ENTRY(header1+1); entry2 = ENTRY(header2+1); while (!IS_LAST_ENTRY(entry1)) { if (IS_LAST_ENTRY(entry2)) return 1; if (entry1->e_hash != entry2->e_hash || entry1->e_name_index != entry2->e_name_index || entry1->e_name_len != entry2->e_name_len || entry1->e_value_size != entry2->e_value_size || entry1->e_value_inum != entry2->e_value_inum || memcmp(entry1->e_name, entry2->e_name, entry1->e_name_len)) return 1; if (!entry1->e_value_inum && memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs), (char *)header2 + le16_to_cpu(entry2->e_value_offs), le32_to_cpu(entry1->e_value_size))) return 1; entry1 = EXT4_XATTR_NEXT(entry1); entry2 = EXT4_XATTR_NEXT(entry2); } if (!IS_LAST_ENTRY(entry2)) return 1; return 0; } /* * ext4_xattr_block_cache_find() * * Find an identical extended attribute block. * * Returns a pointer to the block found, or NULL if such a block was * not found or an error occurred. */ static struct buffer_head * ext4_xattr_block_cache_find(struct inode *inode, struct ext4_xattr_header *header, struct mb_cache_entry **pce) { __u32 hash = le32_to_cpu(header->h_hash); struct mb_cache_entry *ce; struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode); if (!ea_block_cache) return NULL; if (!header->h_hash) return NULL; /* never share */ ea_idebug(inode, "looking for cached blocks [%x]", (int)hash); ce = mb_cache_entry_find_first(ea_block_cache, hash); while (ce) { struct buffer_head *bh; bh = ext4_sb_bread(inode->i_sb, ce->e_value, REQ_PRIO); if (IS_ERR(bh)) { if (PTR_ERR(bh) == -ENOMEM) { mb_cache_entry_put(ea_block_cache, ce); return NULL; } bh = NULL; EXT4_ERROR_INODE(inode, "block %lu read error", (unsigned long)ce->e_value); } else if (ext4_xattr_cmp(header, BHDR(bh)) == 0) { *pce = ce; return bh; } brelse(bh); ce = mb_cache_entry_find_next(ea_block_cache, ce); } return NULL; } #define NAME_HASH_SHIFT 5 #define VALUE_HASH_SHIFT 16 /* * ext4_xattr_hash_entry() * * Compute the hash of an extended attribute. */ static __le32 ext4_xattr_hash_entry(char *name, size_t name_len, __le32 *value, size_t value_count) { __u32 hash = 0; while (name_len--) { hash = (hash << NAME_HASH_SHIFT) ^ (hash >> (8*sizeof(hash) - NAME_HASH_SHIFT)) ^ *name++; } while (value_count--) { hash = (hash << VALUE_HASH_SHIFT) ^ (hash >> (8*sizeof(hash) - VALUE_HASH_SHIFT)) ^ le32_to_cpu(*value++); } return cpu_to_le32(hash); } #undef NAME_HASH_SHIFT #undef VALUE_HASH_SHIFT #define BLOCK_HASH_SHIFT 16 /* * ext4_xattr_rehash() * * Re-compute the extended attribute hash value after an entry has changed. */ static void ext4_xattr_rehash(struct ext4_xattr_header *header) { struct ext4_xattr_entry *here; __u32 hash = 0; here = ENTRY(header+1); while (!IS_LAST_ENTRY(here)) { if (!here->e_hash) { /* Block is not shared if an entry's hash value == 0 */ hash = 0; break; } hash = (hash << BLOCK_HASH_SHIFT) ^ (hash >> (8*sizeof(hash) - BLOCK_HASH_SHIFT)) ^ le32_to_cpu(here->e_hash); here = EXT4_XATTR_NEXT(here); } header->h_hash = cpu_to_le32(hash); } #undef BLOCK_HASH_SHIFT #define HASH_BUCKET_BITS 10 struct mb_cache * ext4_xattr_create_cache(void) { return mb_cache_create(HASH_BUCKET_BITS); } void ext4_xattr_destroy_cache(struct mb_cache *cache) { if (cache) mb_cache_destroy(cache); }
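The entry points above (ext4_xattr_get(), ext4_xattr_set() and ext4_listxattr()) are normally reached through the generic xattr syscalls. The following user-space sketch is illustrative only and is not part of the kernel source; the file path "/mnt/ext4/file" and the attribute name "user.comment" are hypothetical examples.

#include <stdio.h>
#include <string.h>
#include <sys/xattr.h>

/* Illustrative only: exercises the ext4 xattr paths shown above via the
 * generic syscalls.  The path and attribute name are made-up examples. */
int main(void)
{
        const char *path = "/mnt/ext4/file";
        char buf[256];
        ssize_t len;

        /* setxattr() ends up in ext4_xattr_set() -> ext4_xattr_set_handle(). */
        if (setxattr(path, "user.comment", "hello", 5, XATTR_CREATE) != 0)
                perror("setxattr");

        /* getxattr() ends up in ext4_xattr_get(): the in-inode area is tried
         * first, then the external xattr block. */
        len = getxattr(path, "user.comment", buf, sizeof(buf));
        if (len >= 0)
                printf("user.comment = %.*s\n", (int)len, buf);

        /* listxattr() ends up in ext4_listxattr(): ibody entries are listed
         * before block entries; names come back NUL-separated. */
        len = listxattr(path, buf, sizeof(buf));
        for (char *p = buf; len > 0 && p < buf + len; p += strlen(p) + 1)
                printf("name: %s\n", p);

        return 0;
}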
// SPDX-License-Identifier: GPL-2.0-or-later
/* ar-skbuff.c: socket buffer destruction handling
 *
 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/net.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/af_rxrpc.h>
#include "ar-internal.h"

#define is_tx_skb(skb) (rxrpc_skb(skb)->rx_flags & RXRPC_SKB_TX_BUFFER)
#define select_skb_count(skb) (is_tx_skb(skb) ? &rxrpc_n_tx_skbs : &rxrpc_n_rx_skbs)

/*
 * Note the allocation or reception of a socket buffer.
 */
void rxrpc_new_skb(struct sk_buff *skb, enum rxrpc_skb_trace op)
{
	const void *here = __builtin_return_address(0);
	int n = atomic_inc_return(select_skb_count(skb));

	trace_rxrpc_skb(skb, op, refcount_read(&skb->users), n,
			rxrpc_skb(skb)->rx_flags, here);
}

/*
 * Note the re-emergence of a socket buffer from a queue or buffer.
 */
void rxrpc_see_skb(struct sk_buff *skb, enum rxrpc_skb_trace op)
{
	const void *here = __builtin_return_address(0);

	if (skb) {
		int n = atomic_read(select_skb_count(skb));

		trace_rxrpc_skb(skb, op, refcount_read(&skb->users), n,
				rxrpc_skb(skb)->rx_flags, here);
	}
}

/*
 * Note the addition of a ref on a socket buffer.
 */
void rxrpc_get_skb(struct sk_buff *skb, enum rxrpc_skb_trace op)
{
	const void *here = __builtin_return_address(0);
	int n = atomic_inc_return(select_skb_count(skb));

	trace_rxrpc_skb(skb, op, refcount_read(&skb->users), n,
			rxrpc_skb(skb)->rx_flags, here);
	skb_get(skb);
}

/*
 * Note the dropping of a ref on a socket buffer by the core.
 */
void rxrpc_eaten_skb(struct sk_buff *skb, enum rxrpc_skb_trace op)
{
	const void *here = __builtin_return_address(0);
	int n = atomic_inc_return(&rxrpc_n_rx_skbs);

	trace_rxrpc_skb(skb, op, 0, n, 0, here);
}

/*
 * Note the destruction of a socket buffer.
 */
void rxrpc_free_skb(struct sk_buff *skb, enum rxrpc_skb_trace op)
{
	const void *here = __builtin_return_address(0);

	if (skb) {
		int n;

		n = atomic_dec_return(select_skb_count(skb));
		trace_rxrpc_skb(skb, op, refcount_read(&skb->users), n,
				rxrpc_skb(skb)->rx_flags, here);
		kfree_skb(skb);
	}
}

/*
 * Clear a queue of socket buffers.
 */
void rxrpc_purge_queue(struct sk_buff_head *list)
{
	const void *here = __builtin_return_address(0);
	struct sk_buff *skb;

	while ((skb = skb_dequeue((list))) != NULL) {
		int n = atomic_dec_return(select_skb_count(skb));

		trace_rxrpc_skb(skb, rxrpc_skb_purged,
				refcount_read(&skb->users), n,
				rxrpc_skb(skb)->rx_flags, here);
		kfree_skb(skb);
	}
}
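A recurring pattern in ar-skbuff.c is pairing every skb get/free with an atomic counter update and a trace that records __builtin_return_address(0), so leaks can be attributed to a call site. The userspace sketch below shows the same counter-plus-caller-address bookkeeping in miniature; the names and the printf stand-in for the tracepoint are invented for illustration.

/*
 * Minimal userspace sketch (not rxrpc code): track live objects with a
 * global atomic counter and record the caller's address, as the rxrpc
 * skb-tracking helpers do with trace_rxrpc_skb().
 */
#include <stdatomic.h>
#include <stdio.h>

static atomic_int tracked_objects;

static void trace_obj(const char *op, int n, const void *where)
{
	/* stand-in for a tracepoint */
	printf("%s: count=%d caller=%p\n", op, n, where);
}

static void obj_new(void)
{
	/* GCC/Clang builtin: address this function will return to */
	const void *here = __builtin_return_address(0);
	int n = atomic_fetch_add(&tracked_objects, 1) + 1;

	trace_obj("new", n, here);
}

static void obj_free(void)
{
	const void *here = __builtin_return_address(0);
	int n = atomic_fetch_sub(&tracked_objects, 1) - 1;

	trace_obj("free", n, here);
}

int main(void)
{
	obj_new();
	obj_free();
	return 0;
}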
2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright 2008 by Karsten Keil <kkeil@novell.com> */ #include <linux/slab.h> #include <linux/types.h> #include <linux/stddef.h> #include <linux/module.h> #include <linux/spinlock.h> #include <linux/mISDNif.h> #include "core.h" static u_int debug; MODULE_AUTHOR("Karsten Keil"); MODULE_LICENSE("GPL"); module_param(debug, uint, S_IRUGO | S_IWUSR); static u64 device_ids; #define MAX_DEVICE_ID 63 static LIST_HEAD(Bprotocols); static DEFINE_RWLOCK(bp_lock); static void mISDN_dev_release(struct device *dev) { /* nothing to do: the device is part of its parent's data structure */ } static ssize_t id_show(struct device *dev, struct device_attribute *attr, char *buf) { struct mISDNdevice *mdev = dev_to_mISDN(dev); if (!mdev) return -ENODEV; return sprintf(buf, "%d\n", mdev->id); } static DEVICE_ATTR_RO(id); static ssize_t nrbchan_show(struct device *dev, struct device_attribute *attr, char *buf) { struct mISDNdevice *mdev = dev_to_mISDN(dev); if (!mdev) return -ENODEV; return sprintf(buf, "%d\n", mdev->nrbchan); } static DEVICE_ATTR_RO(nrbchan); static ssize_t d_protocols_show(struct device *dev, struct device_attribute *attr, char *buf) { struct mISDNdevice *mdev = dev_to_mISDN(dev); if (!mdev) return -ENODEV; return sprintf(buf, "%d\n", mdev->Dprotocols); } static DEVICE_ATTR_RO(d_protocols); static ssize_t b_protocols_show(struct device *dev, struct device_attribute *attr, char *buf) { struct mISDNdevice *mdev = dev_to_mISDN(dev); if (!mdev) return -ENODEV; return sprintf(buf, "%d\n", mdev->Bprotocols | get_all_Bprotocols()); } static DEVICE_ATTR_RO(b_protocols); static ssize_t protocol_show(struct device *dev, struct device_attribute *attr, char *buf) { struct mISDNdevice *mdev = dev_to_mISDN(dev); if (!mdev) return -ENODEV; return sprintf(buf, "%d\n", mdev->D.protocol); } static DEVICE_ATTR_RO(protocol); static ssize_t name_show(struct device *dev, struct device_attribute *attr, char *buf) { strcpy(buf, 
dev_name(dev)); return strlen(buf); } static DEVICE_ATTR_RO(name); #if 0 /* hangs */ static ssize_t name_set(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { int err = 0; char *out = kmalloc(count + 1, GFP_KERNEL); if (!out) return -ENOMEM; memcpy(out, buf, count); if (count && out[count - 1] == '\n') out[--count] = 0; if (count) err = device_rename(dev, out); kfree(out); return (err < 0) ? err : count; } static DEVICE_ATTR_RW(name); #endif static ssize_t channelmap_show(struct device *dev, struct device_attribute *attr, char *buf) { struct mISDNdevice *mdev = dev_to_mISDN(dev); char *bp = buf; int i; for (i = 0; i <= mdev->nrbchan; i++) *bp++ = test_channelmap(i, mdev->channelmap) ? '1' : '0'; return bp - buf; } static DEVICE_ATTR_RO(channelmap); static struct attribute *mISDN_attrs[] = { &dev_attr_id.attr, &dev_attr_d_protocols.attr, &dev_attr_b_protocols.attr, &dev_attr_protocol.attr, &dev_attr_channelmap.attr, &dev_attr_nrbchan.attr, &dev_attr_name.attr, NULL, }; ATTRIBUTE_GROUPS(mISDN); static int mISDN_uevent(struct device *dev, struct kobj_uevent_env *env) { struct mISDNdevice *mdev = dev_to_mISDN(dev); if (!mdev) return 0; if (add_uevent_var(env, "nchans=%d", mdev->nrbchan)) return -ENOMEM; return 0; } static void mISDN_class_release(struct class *cls) { /* do nothing, it's static */ } static struct class mISDN_class = { .name = "mISDN", .owner = THIS_MODULE, .dev_uevent = mISDN_uevent, .dev_groups = mISDN_groups, .dev_release = mISDN_dev_release, .class_release = mISDN_class_release, }; static int _get_mdevice(struct device *dev, const void *id) { struct mISDNdevice *mdev = dev_to_mISDN(dev); if (!mdev) return 0; if (mdev->id != *(const u_int *)id) return 0; return 1; } struct mISDNdevice *get_mdevice(u_int id) { return dev_to_mISDN(class_find_device(&mISDN_class, NULL, &id, _get_mdevice)); } static int _get_mdevice_count(struct device *dev, void *cnt) { *(int *)cnt += 1; return 0; } int get_mdevice_count(void) { int cnt = 0; class_for_each_device(&mISDN_class, NULL, &cnt, _get_mdevice_count); return cnt; } static int get_free_devid(void) { u_int i; for (i = 0; i <= MAX_DEVICE_ID; i++) if (!test_and_set_bit(i, (u_long *)&device_ids)) break; if (i > MAX_DEVICE_ID) return -EBUSY; return i; } int mISDN_register_device(struct mISDNdevice *dev, struct device *parent, char *name) { int err; err = get_free_devid(); if (err < 0) return err; dev->id = err; device_initialize(&dev->dev); if (name && name[0]) dev_set_name(&dev->dev, "%s", name); else dev_set_name(&dev->dev, "mISDN%d", dev->id); if (debug & DEBUG_CORE) printk(KERN_DEBUG "mISDN_register %s %d\n", dev_name(&dev->dev), dev->id); dev->dev.class = &mISDN_class; err = create_stack(dev); if (err) goto error1; dev->dev.platform_data = dev; dev->dev.parent = parent; dev_set_drvdata(&dev->dev, dev); err = device_add(&dev->dev); if (err) goto error3; return 0; error3: delete_stack(dev); error1: put_device(&dev->dev); return err; } EXPORT_SYMBOL(mISDN_register_device); void mISDN_unregister_device(struct mISDNdevice *dev) { if (debug & DEBUG_CORE) printk(KERN_DEBUG "mISDN_unregister %s %d\n", dev_name(&dev->dev), dev->id); /* sysfs_remove_link(&dev->dev.kobj, "device"); */ device_del(&dev->dev); dev_set_drvdata(&dev->dev, NULL); test_and_clear_bit(dev->id, (u_long *)&device_ids); delete_stack(dev); put_device(&dev->dev); } EXPORT_SYMBOL(mISDN_unregister_device); u_int get_all_Bprotocols(void) { struct Bprotocol *bp; u_int m = 0; read_lock(&bp_lock); list_for_each_entry(bp, &Bprotocols, list) m |= 
bp->Bprotocols; read_unlock(&bp_lock); return m; } struct Bprotocol * get_Bprotocol4mask(u_int m) { struct Bprotocol *bp; read_lock(&bp_lock); list_for_each_entry(bp, &Bprotocols, list) if (bp->Bprotocols & m) { read_unlock(&bp_lock); return bp; } read_unlock(&bp_lock); return NULL; } struct Bprotocol * get_Bprotocol4id(u_int id) { u_int m; if (id < ISDN_P_B_START || id > 63) { printk(KERN_WARNING "%s id not in range %d\n", __func__, id); return NULL; } m = 1 << (id & ISDN_P_B_MASK); return get_Bprotocol4mask(m); } int mISDN_register_Bprotocol(struct Bprotocol *bp) { u_long flags; struct Bprotocol *old; if (debug & DEBUG_CORE) printk(KERN_DEBUG "%s: %s/%x\n", __func__, bp->name, bp->Bprotocols); old = get_Bprotocol4mask(bp->Bprotocols); if (old) { printk(KERN_WARNING "register duplicate protocol old %s/%x new %s/%x\n", old->name, old->Bprotocols, bp->name, bp->Bprotocols); return -EBUSY; } write_lock_irqsave(&bp_lock, flags); list_add_tail(&bp->list, &Bprotocols); write_unlock_irqrestore(&bp_lock, flags); return 0; } EXPORT_SYMBOL(mISDN_register_Bprotocol); void mISDN_unregister_Bprotocol(struct Bprotocol *bp) { u_long flags; if (debug & DEBUG_CORE) printk(KERN_DEBUG "%s: %s/%x\n", __func__, bp->name, bp->Bprotocols); write_lock_irqsave(&bp_lock, flags); list_del(&bp->list); write_unlock_irqrestore(&bp_lock, flags); } EXPORT_SYMBOL(mISDN_unregister_Bprotocol); static const char *msg_no_channel = "<no channel>"; static const char *msg_no_stack = "<no stack>"; static const char *msg_no_stackdev = "<no stack device>"; const char *mISDNDevName4ch(struct mISDNchannel *ch) { if (!ch) return msg_no_channel; if (!ch->st) return msg_no_stack; if (!ch->st->dev) return msg_no_stackdev; return dev_name(&ch->st->dev->dev); }; EXPORT_SYMBOL(mISDNDevName4ch); static int mISDNInit(void) { int err; printk(KERN_INFO "Modular ISDN core version %d.%d.%d\n", MISDN_MAJOR_VERSION, MISDN_MINOR_VERSION, MISDN_RELEASE); mISDN_init_clock(&debug); mISDN_initstack(&debug); err = class_register(&mISDN_class); if (err) goto error1; err = mISDN_inittimer(&debug); if (err) goto error2; err = Isdnl1_Init(&debug); if (err) goto error3; err = Isdnl2_Init(&debug); if (err) goto error4; err = misdn_sock_init(&debug); if (err) goto error5; return 0; error5: Isdnl2_cleanup(); error4: Isdnl1_cleanup(); error3: mISDN_timer_cleanup(); error2: class_unregister(&mISDN_class); error1: return err; } static void mISDN_cleanup(void) { misdn_sock_cleanup(); Isdnl2_cleanup(); Isdnl1_cleanup(); mISDN_timer_cleanup(); class_unregister(&mISDN_class); printk(KERN_DEBUG "mISDNcore unloaded\n"); } module_init(mISDNInit); module_exit(mISDN_cleanup);
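mISDN hands out device ids with get_free_devid(), which scans the 64-bit device_ids bitmap with test_and_set_bit() and returns the first free index (or -EBUSY once all 64 are taken); mISDN_unregister_device() clears the bit again on teardown. The sketch below is a plain, non-atomic userspace rendition of that claim/release logic; the helper names are illustrative only.

/*
 * Userspace sketch of a 64-slot id allocator over a u64 bitmap.  The
 * kernel version uses atomic test_and_set_bit()/test_and_clear_bit();
 * this sketch is single-threaded and only shows the bookkeeping.
 */
#include <stdint.h>
#include <stdio.h>

#define MAX_DEVICE_ID 63

static uint64_t device_ids;	/* bit i set => id i is in use */

static int get_free_devid(void)
{
	unsigned int i;

	for (i = 0; i <= MAX_DEVICE_ID; i++) {
		if (!(device_ids & (1ULL << i))) {
			device_ids |= 1ULL << i;	/* claim it */
			return i;
		}
	}
	return -1;	/* all ids busy (the kernel returns -EBUSY) */
}

static void put_devid(int id)
{
	if (id >= 0 && id <= MAX_DEVICE_ID)
		device_ids &= ~(1ULL << id);	/* release for reuse */
}

int main(void)
{
	int a = get_free_devid();
	int b = get_free_devid();

	printf("allocated ids %d and %d\n", a, b);
	put_devid(a);
	printf("id %d reused after release: %d\n", a, get_free_devid());
	return 0;
}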
1634 1635 1636 1637 1638 1639 1640 1641 // SPDX-License-Identifier: GPL-2.0-only /* * This is a module which is used for queueing packets and communicating with * userspace via nfnetlink. * * (C) 2005 by Harald Welte <laforge@netfilter.org> * (C) 2007 by Patrick McHardy <kaber@trash.net> * * Based on the old ipv4-only ip_queue.c: * (C) 2000-2002 James Morris <jmorris@intercode.com.au> * (C) 2003-2005 Netfilter Core Team <coreteam@netfilter.org> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/skbuff.h> #include <linux/init.h> #include <linux/spinlock.h> #include <linux/slab.h> #include <linux/notifier.h> #include <linux/netdevice.h> #include <linux/netfilter.h> #include <linux/proc_fs.h> #include <linux/netfilter_ipv4.h> #include <linux/netfilter_ipv6.h> #include <linux/netfilter_bridge.h> #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_queue.h> #include <linux/netfilter/nf_conntrack_common.h> #include <linux/list.h> #include <net/sock.h> #include <net/tcp_states.h> #include <net/netfilter/nf_queue.h> #include <net/netns/generic.h> #include <linux/atomic.h> #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) #include "../bridge/br_private.h" #endif #if IS_ENABLED(CONFIG_NF_CONNTRACK) #include <net/netfilter/nf_conntrack.h> #endif #define NFQNL_QMAX_DEFAULT 1024 /* We're using struct nlattr which has 16bit nla_len. Note that nla_len * includes the header length. Thus, the maximum packet length that we * support is 65531 bytes. We send truncated packets if the specified length * is larger than that. Userspace can check for presence of NFQA_CAP_LEN * attribute to detect truncation. */ #define NFQNL_MAX_COPY_RANGE (0xffff - NLA_HDRLEN) struct nfqnl_instance { struct hlist_node hlist; /* global list of queues */ struct rcu_head rcu; u32 peer_portid; unsigned int queue_maxlen; unsigned int copy_range; unsigned int queue_dropped; unsigned int queue_user_dropped; u_int16_t queue_num; /* number of this queue */ u_int8_t copy_mode; u_int32_t flags; /* Set using NFQA_CFG_FLAGS */ /* * Following fields are dirtied for each queued packet, * keep them in same cache line if possible. 
*/ spinlock_t lock ____cacheline_aligned_in_smp; unsigned int queue_total; unsigned int id_sequence; /* 'sequence' of pkt ids */ struct list_head queue_list; /* packets in queue */ }; typedef int (*nfqnl_cmpfn)(struct nf_queue_entry *, unsigned long); static unsigned int nfnl_queue_net_id __read_mostly; #define INSTANCE_BUCKETS 16 struct nfnl_queue_net { spinlock_t instances_lock; struct hlist_head instance_table[INSTANCE_BUCKETS]; }; static struct nfnl_queue_net *nfnl_queue_pernet(struct net *net) { return net_generic(net, nfnl_queue_net_id); } static inline u_int8_t instance_hashfn(u_int16_t queue_num) { return ((queue_num >> 8) ^ queue_num) % INSTANCE_BUCKETS; } static struct nfqnl_instance * instance_lookup(struct nfnl_queue_net *q, u_int16_t queue_num) { struct hlist_head *head; struct nfqnl_instance *inst; head = &q->instance_table[instance_hashfn(queue_num)]; hlist_for_each_entry_rcu(inst, head, hlist) { if (inst->queue_num == queue_num) return inst; } return NULL; } static struct nfqnl_instance * instance_create(struct nfnl_queue_net *q, u_int16_t queue_num, u32 portid) { struct nfqnl_instance *inst; unsigned int h; int err; spin_lock(&q->instances_lock); if (instance_lookup(q, queue_num)) { err = -EEXIST; goto out_unlock; } inst = kzalloc(sizeof(*inst), GFP_ATOMIC); if (!inst) { err = -ENOMEM; goto out_unlock; } inst->queue_num = queue_num; inst->peer_portid = portid; inst->queue_maxlen = NFQNL_QMAX_DEFAULT; inst->copy_range = NFQNL_MAX_COPY_RANGE; inst->copy_mode = NFQNL_COPY_NONE; spin_lock_init(&inst->lock); INIT_LIST_HEAD(&inst->queue_list); if (!try_module_get(THIS_MODULE)) { err = -EAGAIN; goto out_free; } h = instance_hashfn(queue_num); hlist_add_head_rcu(&inst->hlist, &q->instance_table[h]); spin_unlock(&q->instances_lock); return inst; out_free: kfree(inst); out_unlock: spin_unlock(&q->instances_lock); return ERR_PTR(err); } static void nfqnl_flush(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn, unsigned long data); static void instance_destroy_rcu(struct rcu_head *head) { struct nfqnl_instance *inst = container_of(head, struct nfqnl_instance, rcu); rcu_read_lock(); nfqnl_flush(inst, NULL, 0); rcu_read_unlock(); kfree(inst); module_put(THIS_MODULE); } static void __instance_destroy(struct nfqnl_instance *inst) { hlist_del_rcu(&inst->hlist); call_rcu(&inst->rcu, instance_destroy_rcu); } static void instance_destroy(struct nfnl_queue_net *q, struct nfqnl_instance *inst) { spin_lock(&q->instances_lock); __instance_destroy(inst); spin_unlock(&q->instances_lock); } static inline void __enqueue_entry(struct nfqnl_instance *queue, struct nf_queue_entry *entry) { list_add_tail(&entry->list, &queue->queue_list); queue->queue_total++; } static void __dequeue_entry(struct nfqnl_instance *queue, struct nf_queue_entry *entry) { list_del(&entry->list); queue->queue_total--; } static struct nf_queue_entry * find_dequeue_entry(struct nfqnl_instance *queue, unsigned int id) { struct nf_queue_entry *entry = NULL, *i; spin_lock_bh(&queue->lock); list_for_each_entry(i, &queue->queue_list, list) { if (i->id == id) { entry = i; break; } } if (entry) __dequeue_entry(queue, entry); spin_unlock_bh(&queue->lock); return entry; } static void nfqnl_reinject(struct nf_queue_entry *entry, unsigned int verdict) { const struct nf_ct_hook *ct_hook; int err; if (verdict == NF_ACCEPT || verdict == NF_REPEAT || verdict == NF_STOP) { rcu_read_lock(); ct_hook = rcu_dereference(nf_ct_hook); if (ct_hook) { err = ct_hook->update(entry->state.net, entry->skb); if (err < 0) verdict = NF_DROP; } rcu_read_unlock(); 
} nf_reinject(entry, verdict); } static void nfqnl_flush(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn, unsigned long data) { struct nf_queue_entry *entry, *next; spin_lock_bh(&queue->lock); list_for_each_entry_safe(entry, next, &queue->queue_list, list) { if (!cmpfn || cmpfn(entry, data)) { list_del(&entry->list); queue->queue_total--; nfqnl_reinject(entry, NF_DROP); } } spin_unlock_bh(&queue->lock); } static int nfqnl_put_packet_info(struct sk_buff *nlskb, struct sk_buff *packet, bool csum_verify) { __u32 flags = 0; if (packet->ip_summed == CHECKSUM_PARTIAL) flags = NFQA_SKB_CSUMNOTREADY; else if (csum_verify) flags = NFQA_SKB_CSUM_NOTVERIFIED; if (skb_is_gso(packet)) flags |= NFQA_SKB_GSO; return flags ? nla_put_be32(nlskb, NFQA_SKB_INFO, htonl(flags)) : 0; } static int nfqnl_put_sk_uidgid(struct sk_buff *skb, struct sock *sk) { const struct cred *cred; if (!sk_fullsock(sk)) return 0; read_lock_bh(&sk->sk_callback_lock); if (sk->sk_socket && sk->sk_socket->file) { cred = sk->sk_socket->file->f_cred; if (nla_put_be32(skb, NFQA_UID, htonl(from_kuid_munged(&init_user_ns, cred->fsuid)))) goto nla_put_failure; if (nla_put_be32(skb, NFQA_GID, htonl(from_kgid_munged(&init_user_ns, cred->fsgid)))) goto nla_put_failure; } read_unlock_bh(&sk->sk_callback_lock); return 0; nla_put_failure: read_unlock_bh(&sk->sk_callback_lock); return -1; } static u32 nfqnl_get_sk_secctx(struct sk_buff *skb, char **secdata) { u32 seclen = 0; #if IS_ENABLED(CONFIG_NETWORK_SECMARK) if (!skb || !sk_fullsock(skb->sk)) return 0; read_lock_bh(&skb->sk->sk_callback_lock); if (skb->secmark) security_secid_to_secctx(skb->secmark, secdata, &seclen); read_unlock_bh(&skb->sk->sk_callback_lock); #endif return seclen; } static u32 nfqnl_get_bridge_size(struct nf_queue_entry *entry) { struct sk_buff *entskb = entry->skb; u32 nlalen = 0; if (entry->state.pf != PF_BRIDGE || !skb_mac_header_was_set(entskb)) return 0; if (skb_vlan_tag_present(entskb)) nlalen += nla_total_size(nla_total_size(sizeof(__be16)) + nla_total_size(sizeof(__be16))); if (entskb->network_header > entskb->mac_header) nlalen += nla_total_size((entskb->network_header - entskb->mac_header)); return nlalen; } static int nfqnl_put_bridge(struct nf_queue_entry *entry, struct sk_buff *skb) { struct sk_buff *entskb = entry->skb; if (entry->state.pf != PF_BRIDGE || !skb_mac_header_was_set(entskb)) return 0; if (skb_vlan_tag_present(entskb)) { struct nlattr *nest; nest = nla_nest_start(skb, NFQA_VLAN); if (!nest) goto nla_put_failure; if (nla_put_be16(skb, NFQA_VLAN_TCI, htons(entskb->vlan_tci)) || nla_put_be16(skb, NFQA_VLAN_PROTO, entskb->vlan_proto)) goto nla_put_failure; nla_nest_end(skb, nest); } if (entskb->mac_header < entskb->network_header) { int len = (int)(entskb->network_header - entskb->mac_header); if (nla_put(skb, NFQA_L2HDR, len, skb_mac_header(entskb))) goto nla_put_failure; } return 0; nla_put_failure: return -1; } static struct sk_buff * nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, struct nf_queue_entry *entry, __be32 **packet_id_ptr) { size_t size; size_t data_len = 0, cap_len = 0; unsigned int hlen = 0; struct sk_buff *skb; struct nlattr *nla; struct nfqnl_msg_packet_hdr *pmsg; struct nlmsghdr *nlh; struct sk_buff *entskb = entry->skb; struct net_device *indev; struct net_device *outdev; struct nf_conn *ct = NULL; enum ip_conntrack_info ctinfo = 0; const struct nfnl_ct_hook *nfnl_ct; bool csum_verify; char *secdata = NULL; u32 seclen = 0; size = nlmsg_total_size(sizeof(struct nfgenmsg)) + nla_total_size(sizeof(struct 
nfqnl_msg_packet_hdr)) + nla_total_size(sizeof(u_int32_t)) /* ifindex */ + nla_total_size(sizeof(u_int32_t)) /* ifindex */ #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) + nla_total_size(sizeof(u_int32_t)) /* ifindex */ + nla_total_size(sizeof(u_int32_t)) /* ifindex */ #endif + nla_total_size(sizeof(u_int32_t)) /* mark */ + nla_total_size(sizeof(struct nfqnl_msg_packet_hw)) + nla_total_size(sizeof(u_int32_t)) /* skbinfo */ + nla_total_size(sizeof(u_int32_t)); /* cap_len */ if (entskb->tstamp) size += nla_total_size(sizeof(struct nfqnl_msg_packet_timestamp)); size += nfqnl_get_bridge_size(entry); if (entry->state.hook <= NF_INET_FORWARD || (entry->state.hook == NF_INET_POST_ROUTING && entskb->sk == NULL)) csum_verify = !skb_csum_unnecessary(entskb); else csum_verify = false; outdev = entry->state.out; switch ((enum nfqnl_config_mode)READ_ONCE(queue->copy_mode)) { case NFQNL_COPY_META: case NFQNL_COPY_NONE: break; case NFQNL_COPY_PACKET: if (!(queue->flags & NFQA_CFG_F_GSO) && entskb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_help(entskb)) return NULL; data_len = READ_ONCE(queue->copy_range); if (data_len > entskb->len) data_len = entskb->len; hlen = skb_zerocopy_headlen(entskb); hlen = min_t(unsigned int, hlen, data_len); size += sizeof(struct nlattr) + hlen; cap_len = entskb->len; break; } nfnl_ct = rcu_dereference(nfnl_ct_hook); #if IS_ENABLED(CONFIG_NF_CONNTRACK) if (queue->flags & NFQA_CFG_F_CONNTRACK) { if (nfnl_ct != NULL) { ct = nf_ct_get(entskb, &ctinfo); if (ct != NULL) size += nfnl_ct->build_size(ct); } } #endif if (queue->flags & NFQA_CFG_F_UID_GID) { size += (nla_total_size(sizeof(u_int32_t)) /* uid */ + nla_total_size(sizeof(u_int32_t))); /* gid */ } if ((queue->flags & NFQA_CFG_F_SECCTX) && entskb->sk) { seclen = nfqnl_get_sk_secctx(entskb, &secdata); if (seclen) size += nla_total_size(seclen); } skb = alloc_skb(size, GFP_ATOMIC); if (!skb) { skb_tx_error(entskb); goto nlmsg_failure; } nlh = nfnl_msg_put(skb, 0, 0, nfnl_msg_type(NFNL_SUBSYS_QUEUE, NFQNL_MSG_PACKET), 0, entry->state.pf, NFNETLINK_V0, htons(queue->queue_num)); if (!nlh) { skb_tx_error(entskb); kfree_skb(skb); goto nlmsg_failure; } nla = __nla_reserve(skb, NFQA_PACKET_HDR, sizeof(*pmsg)); pmsg = nla_data(nla); pmsg->hw_protocol = entskb->protocol; pmsg->hook = entry->state.hook; *packet_id_ptr = &pmsg->packet_id; indev = entry->state.in; if (indev) { #if !IS_ENABLED(CONFIG_BRIDGE_NETFILTER) if (nla_put_be32(skb, NFQA_IFINDEX_INDEV, htonl(indev->ifindex))) goto nla_put_failure; #else if (entry->state.pf == PF_BRIDGE) { /* Case 1: indev is physical input device, we need to * look for bridge group (when called from * netfilter_bridge) */ if (nla_put_be32(skb, NFQA_IFINDEX_PHYSINDEV, htonl(indev->ifindex)) || /* this is the bridge group "brX" */ /* rcu_read_lock()ed by __nf_queue */ nla_put_be32(skb, NFQA_IFINDEX_INDEV, htonl(br_port_get_rcu(indev)->br->dev->ifindex))) goto nla_put_failure; } else { int physinif; /* Case 2: indev is bridge group, we need to look for * physical device (when called from ipv4) */ if (nla_put_be32(skb, NFQA_IFINDEX_INDEV, htonl(indev->ifindex))) goto nla_put_failure; physinif = nf_bridge_get_physinif(entskb); if (physinif && nla_put_be32(skb, NFQA_IFINDEX_PHYSINDEV, htonl(physinif))) goto nla_put_failure; } #endif } if (outdev) { #if !IS_ENABLED(CONFIG_BRIDGE_NETFILTER) if (nla_put_be32(skb, NFQA_IFINDEX_OUTDEV, htonl(outdev->ifindex))) goto nla_put_failure; #else if (entry->state.pf == PF_BRIDGE) { /* Case 1: outdev is physical output device, we need to * look for bridge group (when called 
from * netfilter_bridge) */ if (nla_put_be32(skb, NFQA_IFINDEX_PHYSOUTDEV, htonl(outdev->ifindex)) || /* this is the bridge group "brX" */ /* rcu_read_lock()ed by __nf_queue */ nla_put_be32(skb, NFQA_IFINDEX_OUTDEV, htonl(br_port_get_rcu(outdev)->br->dev->ifindex))) goto nla_put_failure; } else { int physoutif; /* Case 2: outdev is bridge group, we need to look for * physical output device (when called from ipv4) */ if (nla_put_be32(skb, NFQA_IFINDEX_OUTDEV, htonl(outdev->ifindex))) goto nla_put_failure; physoutif = nf_bridge_get_physoutif(entskb); if (physoutif && nla_put_be32(skb, NFQA_IFINDEX_PHYSOUTDEV, htonl(physoutif))) goto nla_put_failure; } #endif } if (entskb->mark && nla_put_be32(skb, NFQA_MARK, htonl(entskb->mark))) goto nla_put_failure; if (indev && entskb->dev && skb_mac_header_was_set(entskb) && skb_mac_header_len(entskb) != 0) { struct nfqnl_msg_packet_hw phw; int len; memset(&phw, 0, sizeof(phw)); len = dev_parse_header(entskb, phw.hw_addr); if (len) { phw.hw_addrlen = htons(len); if (nla_put(skb, NFQA_HWADDR, sizeof(phw), &phw)) goto nla_put_failure; } } if (nfqnl_put_bridge(entry, skb) < 0) goto nla_put_failure; if (entry->state.hook <= NF_INET_FORWARD && entskb->tstamp) { struct nfqnl_msg_packet_timestamp ts; struct timespec64 kts = ktime_to_timespec64(entskb->tstamp); ts.sec = cpu_to_be64(kts.tv_sec); ts.usec = cpu_to_be64(kts.tv_nsec / NSEC_PER_USEC); if (nla_put(skb, NFQA_TIMESTAMP, sizeof(ts), &ts)) goto nla_put_failure; } if ((queue->flags & NFQA_CFG_F_UID_GID) && entskb->sk && nfqnl_put_sk_uidgid(skb, entskb->sk) < 0) goto nla_put_failure; if (seclen && nla_put(skb, NFQA_SECCTX, seclen, secdata)) goto nla_put_failure; if (ct && nfnl_ct->build(skb, ct, ctinfo, NFQA_CT, NFQA_CT_INFO) < 0) goto nla_put_failure; if (cap_len > data_len && nla_put_be32(skb, NFQA_CAP_LEN, htonl(cap_len))) goto nla_put_failure; if (nfqnl_put_packet_info(skb, entskb, csum_verify)) goto nla_put_failure; if (data_len) { struct nlattr *nla; if (skb_tailroom(skb) < sizeof(*nla) + hlen) goto nla_put_failure; nla = skb_put(skb, sizeof(*nla)); nla->nla_type = NFQA_PAYLOAD; nla->nla_len = nla_attr_size(data_len); if (skb_zerocopy(skb, entskb, data_len, hlen)) goto nla_put_failure; } nlh->nlmsg_len = skb->len; if (seclen) security_release_secctx(secdata, seclen); return skb; nla_put_failure: skb_tx_error(entskb); kfree_skb(skb); net_err_ratelimited("nf_queue: error creating packet message\n"); nlmsg_failure: if (seclen) security_release_secctx(secdata, seclen); return NULL; } static bool nf_ct_drop_unconfirmed(const struct nf_queue_entry *entry) { #if IS_ENABLED(CONFIG_NF_CONNTRACK) static const unsigned long flags = IPS_CONFIRMED | IPS_DYING; struct nf_conn *ct = (void *)skb_nfct(entry->skb); unsigned long status; unsigned int use; if (!ct) return false; status = READ_ONCE(ct->status); if ((status & flags) == IPS_DYING) return true; if (status & IPS_CONFIRMED) return false; /* in some cases skb_clone() can occur after initial conntrack * pickup, but conntrack assumes exclusive skb->_nfct ownership for * unconfirmed entries. * * This happens for br_netfilter and with ip multicast routing. * We can't be solved with serialization here because one clone could * have been queued for local delivery. */ use = refcount_read(&ct->ct_general.use); if (likely(use == 1)) return false; /* Can't decrement further? Exclusive ownership. */ if (!refcount_dec_not_one(&ct->ct_general.use)) return false; skb_set_nfct(entry->skb, 0); /* No nf_ct_put(): we already decremented .use and it cannot * drop down to 0. 
*/ return true; #endif return false; } static int __nfqnl_enqueue_packet(struct net *net, struct nfqnl_instance *queue, struct nf_queue_entry *entry) { struct sk_buff *nskb; int err = -ENOBUFS; __be32 *packet_id_ptr; int failopen = 0; nskb = nfqnl_build_packet_message(net, queue, entry, &packet_id_ptr); if (nskb == NULL) { err = -ENOMEM; goto err_out; } spin_lock_bh(&queue->lock); if (nf_ct_drop_unconfirmed(entry)) goto err_out_free_nskb; if (queue->queue_total >= queue->queue_maxlen) { if (queue->flags & NFQA_CFG_F_FAIL_OPEN) { failopen = 1; err = 0; } else { queue->queue_dropped++; net_warn_ratelimited("nf_queue: full at %d entries, dropping packets(s)\n", queue->queue_total); } goto err_out_free_nskb; } entry->id = ++queue->id_sequence; *packet_id_ptr = htonl(entry->id); /* nfnetlink_unicast will either free the nskb or add it to a socket */ err = nfnetlink_unicast(nskb, net, queue->peer_portid); if (err < 0) { if (queue->flags & NFQA_CFG_F_FAIL_OPEN) { failopen = 1; err = 0; } else { queue->queue_user_dropped++; } goto err_out_unlock; } __enqueue_entry(queue, entry); spin_unlock_bh(&queue->lock); return 0; err_out_free_nskb: kfree_skb(nskb); err_out_unlock: spin_unlock_bh(&queue->lock); if (failopen) nfqnl_reinject(entry, NF_ACCEPT); err_out: return err; } static struct nf_queue_entry * nf_queue_entry_dup(struct nf_queue_entry *e) { struct nf_queue_entry *entry = kmemdup(e, e->size, GFP_ATOMIC); if (!entry) return NULL; if (nf_queue_entry_get_refs(entry)) return entry; kfree(entry); return NULL; } #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) /* When called from bridge netfilter, skb->data must point to MAC header * before calling skb_gso_segment(). Else, original MAC header is lost * and segmented skbs will be sent to wrong destination. */ static void nf_bridge_adjust_skb_data(struct sk_buff *skb) { if (nf_bridge_info_get(skb)) __skb_push(skb, skb->network_header - skb->mac_header); } static void nf_bridge_adjust_segmented_data(struct sk_buff *skb) { if (nf_bridge_info_get(skb)) __skb_pull(skb, skb->network_header - skb->mac_header); } #else #define nf_bridge_adjust_skb_data(s) do {} while (0) #define nf_bridge_adjust_segmented_data(s) do {} while (0) #endif static int __nfqnl_enqueue_packet_gso(struct net *net, struct nfqnl_instance *queue, struct sk_buff *skb, struct nf_queue_entry *entry) { int ret = -ENOMEM; struct nf_queue_entry *entry_seg; nf_bridge_adjust_segmented_data(skb); if (skb->next == NULL) { /* last packet, no need to copy entry */ struct sk_buff *gso_skb = entry->skb; entry->skb = skb; ret = __nfqnl_enqueue_packet(net, queue, entry); if (ret) entry->skb = gso_skb; return ret; } skb_mark_not_on_list(skb); entry_seg = nf_queue_entry_dup(entry); if (entry_seg) { entry_seg->skb = skb; ret = __nfqnl_enqueue_packet(net, queue, entry_seg); if (ret) nf_queue_entry_free(entry_seg); } return ret; } static int nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum) { unsigned int queued; struct nfqnl_instance *queue; struct sk_buff *skb, *segs, *nskb; int err = -ENOBUFS; struct net *net = entry->state.net; struct nfnl_queue_net *q = nfnl_queue_pernet(net); /* rcu_read_lock()ed by nf_hook_thresh */ queue = instance_lookup(q, queuenum); if (!queue) return -ESRCH; if (queue->copy_mode == NFQNL_COPY_NONE) return -EINVAL; skb = entry->skb; switch (entry->state.pf) { case NFPROTO_IPV4: skb->protocol = htons(ETH_P_IP); break; case NFPROTO_IPV6: skb->protocol = htons(ETH_P_IPV6); break; } if ((queue->flags & NFQA_CFG_F_GSO) || !skb_is_gso(skb)) return 
__nfqnl_enqueue_packet(net, queue, entry); nf_bridge_adjust_skb_data(skb); segs = skb_gso_segment(skb, 0); /* Does not use PTR_ERR to limit the number of error codes that can be * returned by nf_queue. For instance, callers rely on -ESRCH to * mean 'ignore this hook'. */ if (IS_ERR_OR_NULL(segs)) goto out_err; queued = 0; err = 0; skb_list_walk_safe(segs, segs, nskb) { if (err == 0) err = __nfqnl_enqueue_packet_gso(net, queue, segs, entry); if (err == 0) queued++; else kfree_skb(segs); } if (queued) { if (err) /* some segments are already queued */ nf_queue_entry_free(entry); kfree_skb(skb); return 0; } out_err: nf_bridge_adjust_segmented_data(skb); return err; } static int nfqnl_mangle(void *data, unsigned int data_len, struct nf_queue_entry *e, int diff) { struct sk_buff *nskb; if (diff < 0) { unsigned int min_len = skb_transport_offset(e->skb); if (data_len < min_len) return -EINVAL; if (pskb_trim(e->skb, data_len)) return -ENOMEM; } else if (diff > 0) { if (data_len > 0xFFFF) return -EINVAL; if (diff > skb_tailroom(e->skb)) { nskb = skb_copy_expand(e->skb, skb_headroom(e->skb), diff, GFP_ATOMIC); if (!nskb) return -ENOMEM; kfree_skb(e->skb); e->skb = nskb; } skb_put(e->skb, diff); } if (skb_ensure_writable(e->skb, data_len)) return -ENOMEM; skb_copy_to_linear_data(e->skb, data, data_len); e->skb->ip_summed = CHECKSUM_NONE; return 0; } static int nfqnl_set_mode(struct nfqnl_instance *queue, unsigned char mode, unsigned int range) { int status = 0; spin_lock_bh(&queue->lock); switch (mode) { case NFQNL_COPY_NONE: case NFQNL_COPY_META: queue->copy_mode = mode; queue->copy_range = 0; break; case NFQNL_COPY_PACKET: queue->copy_mode = mode; if (range == 0 || range > NFQNL_MAX_COPY_RANGE) queue->copy_range = NFQNL_MAX_COPY_RANGE; else queue->copy_range = range; break; default: status = -EINVAL; } spin_unlock_bh(&queue->lock); return status; } static int dev_cmp(struct nf_queue_entry *entry, unsigned long ifindex) { #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) int physinif, physoutif; physinif = nf_bridge_get_physinif(entry->skb); physoutif = nf_bridge_get_physoutif(entry->skb); if (physinif == ifindex || physoutif == ifindex) return 1; #endif if (entry->state.in) if (entry->state.in->ifindex == ifindex) return 1; if (entry->state.out) if (entry->state.out->ifindex == ifindex) return 1; return 0; } /* drop all packets with either indev or outdev == ifindex from all queue * instances */ static void nfqnl_dev_drop(struct net *net, int ifindex) { int i; struct nfnl_queue_net *q = nfnl_queue_pernet(net); rcu_read_lock(); for (i = 0; i < INSTANCE_BUCKETS; i++) { struct nfqnl_instance *inst; struct hlist_head *head = &q->instance_table[i]; hlist_for_each_entry_rcu(inst, head, hlist) nfqnl_flush(inst, dev_cmp, ifindex); } rcu_read_unlock(); } static int nfqnl_rcv_dev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); /* Drop any packets associated with the downed device */ if (event == NETDEV_DOWN) nfqnl_dev_drop(dev_net(dev), dev->ifindex); return NOTIFY_DONE; } static struct notifier_block nfqnl_dev_notifier = { .notifier_call = nfqnl_rcv_dev_event, }; static void nfqnl_nf_hook_drop(struct net *net) { struct nfnl_queue_net *q = nfnl_queue_pernet(net); int i; /* This function is also called on net namespace error unwind, * when pernet_ops->init() failed and ->exit() functions of the * previous pernet_ops gets called. * * This may result in a call to nfqnl_nf_hook_drop() before * struct nfnl_queue_net was allocated. 
*/ if (!q) return; for (i = 0; i < INSTANCE_BUCKETS; i++) { struct nfqnl_instance *inst; struct hlist_head *head = &q->instance_table[i]; hlist_for_each_entry_rcu(inst, head, hlist) nfqnl_flush(inst, NULL, 0); } } static int nfqnl_rcv_nl_event(struct notifier_block *this, unsigned long event, void *ptr) { struct netlink_notify *n = ptr; struct nfnl_queue_net *q = nfnl_queue_pernet(n->net); if (event == NETLINK_URELEASE && n->protocol == NETLINK_NETFILTER) { int i; /* destroy all instances for this portid */ spin_lock(&q->instances_lock); for (i = 0; i < INSTANCE_BUCKETS; i++) { struct hlist_node *t2; struct nfqnl_instance *inst; struct hlist_head *head = &q->instance_table[i]; hlist_for_each_entry_safe(inst, t2, head, hlist) { if (n->portid == inst->peer_portid) __instance_destroy(inst); } } spin_unlock(&q->instances_lock); } return NOTIFY_DONE; } static struct notifier_block nfqnl_rtnl_notifier = { .notifier_call = nfqnl_rcv_nl_event, }; static const struct nla_policy nfqa_vlan_policy[NFQA_VLAN_MAX + 1] = { [NFQA_VLAN_TCI] = { .type = NLA_U16}, [NFQA_VLAN_PROTO] = { .type = NLA_U16}, }; static const struct nla_policy nfqa_verdict_policy[NFQA_MAX+1] = { [NFQA_VERDICT_HDR] = { .len = sizeof(struct nfqnl_msg_verdict_hdr) }, [NFQA_MARK] = { .type = NLA_U32 }, [NFQA_PAYLOAD] = { .type = NLA_UNSPEC }, [NFQA_CT] = { .type = NLA_UNSPEC }, [NFQA_EXP] = { .type = NLA_UNSPEC }, [NFQA_VLAN] = { .type = NLA_NESTED }, }; static const struct nla_policy nfqa_verdict_batch_policy[NFQA_MAX+1] = { [NFQA_VERDICT_HDR] = { .len = sizeof(struct nfqnl_msg_verdict_hdr) }, [NFQA_MARK] = { .type = NLA_U32 }, }; static struct nfqnl_instance * verdict_instance_lookup(struct nfnl_queue_net *q, u16 queue_num, u32 nlportid) { struct nfqnl_instance *queue; queue = instance_lookup(q, queue_num); if (!queue) return ERR_PTR(-ENODEV); if (queue->peer_portid != nlportid) return ERR_PTR(-EPERM); return queue; } static struct nfqnl_msg_verdict_hdr* verdicthdr_get(const struct nlattr * const nfqa[]) { struct nfqnl_msg_verdict_hdr *vhdr; unsigned int verdict; if (!nfqa[NFQA_VERDICT_HDR]) return NULL; vhdr = nla_data(nfqa[NFQA_VERDICT_HDR]); verdict = ntohl(vhdr->verdict) & NF_VERDICT_MASK; if (verdict > NF_MAX_VERDICT || verdict == NF_STOLEN) return NULL; return vhdr; } static int nfq_id_after(unsigned int id, unsigned int max) { return (int)(id - max) > 0; } static int nfqnl_recv_verdict_batch(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nfqa[]) { struct nfnl_queue_net *q = nfnl_queue_pernet(info->net); u16 queue_num = ntohs(info->nfmsg->res_id); struct nf_queue_entry *entry, *tmp; struct nfqnl_msg_verdict_hdr *vhdr; struct nfqnl_instance *queue; unsigned int verdict, maxid; LIST_HEAD(batch_list); queue = verdict_instance_lookup(q, queue_num, NETLINK_CB(skb).portid); if (IS_ERR(queue)) return PTR_ERR(queue); vhdr = verdicthdr_get(nfqa); if (!vhdr) return -EINVAL; verdict = ntohl(vhdr->verdict); maxid = ntohl(vhdr->id); spin_lock_bh(&queue->lock); list_for_each_entry_safe(entry, tmp, &queue->queue_list, list) { if (nfq_id_after(entry->id, maxid)) break; __dequeue_entry(queue, entry); list_add_tail(&entry->list, &batch_list); } spin_unlock_bh(&queue->lock); if (list_empty(&batch_list)) return -ENOENT; list_for_each_entry_safe(entry, tmp, &batch_list, list) { if (nfqa[NFQA_MARK]) entry->skb->mark = ntohl(nla_get_be32(nfqa[NFQA_MARK])); nfqnl_reinject(entry, verdict); } return 0; } static struct nf_conn *nfqnl_ct_parse(const struct nfnl_ct_hook *nfnl_ct, const struct nlmsghdr *nlh, const struct 
nlattr * const nfqa[], struct nf_queue_entry *entry, enum ip_conntrack_info *ctinfo) { #if IS_ENABLED(CONFIG_NF_CONNTRACK) struct nf_conn *ct; ct = nf_ct_get(entry->skb, ctinfo); if (ct == NULL) return NULL; if (nfnl_ct->parse(nfqa[NFQA_CT], ct) < 0) return NULL; if (nfqa[NFQA_EXP]) nfnl_ct->attach_expect(nfqa[NFQA_EXP], ct, NETLINK_CB(entry->skb).portid, nlmsg_report(nlh)); return ct; #else return NULL; #endif } static int nfqa_parse_bridge(struct nf_queue_entry *entry, const struct nlattr * const nfqa[]) { if (nfqa[NFQA_VLAN]) { struct nlattr *tb[NFQA_VLAN_MAX + 1]; int err; err = nla_parse_nested_deprecated(tb, NFQA_VLAN_MAX, nfqa[NFQA_VLAN], nfqa_vlan_policy, NULL); if (err < 0) return err; if (!tb[NFQA_VLAN_TCI] || !tb[NFQA_VLAN_PROTO]) return -EINVAL; __vlan_hwaccel_put_tag(entry->skb, nla_get_be16(tb[NFQA_VLAN_PROTO]), ntohs(nla_get_be16(tb[NFQA_VLAN_TCI]))); } if (nfqa[NFQA_L2HDR]) { int mac_header_len = entry->skb->network_header - entry->skb->mac_header; if (mac_header_len != nla_len(nfqa[NFQA_L2HDR])) return -EINVAL; else if (mac_header_len > 0) memcpy(skb_mac_header(entry->skb), nla_data(nfqa[NFQA_L2HDR]), mac_header_len); } return 0; } static int nfqnl_recv_verdict(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nfqa[]) { struct nfnl_queue_net *q = nfnl_queue_pernet(info->net); u_int16_t queue_num = ntohs(info->nfmsg->res_id); const struct nfnl_ct_hook *nfnl_ct; struct nfqnl_msg_verdict_hdr *vhdr; enum ip_conntrack_info ctinfo; struct nfqnl_instance *queue; struct nf_queue_entry *entry; struct nf_conn *ct = NULL; unsigned int verdict; int err; queue = verdict_instance_lookup(q, queue_num, NETLINK_CB(skb).portid); if (IS_ERR(queue)) return PTR_ERR(queue); vhdr = verdicthdr_get(nfqa); if (!vhdr) return -EINVAL; verdict = ntohl(vhdr->verdict); entry = find_dequeue_entry(queue, ntohl(vhdr->id)); if (entry == NULL) return -ENOENT; /* rcu lock already held from nfnl->call_rcu. 
*/ nfnl_ct = rcu_dereference(nfnl_ct_hook); if (nfqa[NFQA_CT]) { if (nfnl_ct != NULL) ct = nfqnl_ct_parse(nfnl_ct, info->nlh, nfqa, entry, &ctinfo); } if (entry->state.pf == PF_BRIDGE) { err = nfqa_parse_bridge(entry, nfqa); if (err < 0) return err; } if (nfqa[NFQA_PAYLOAD]) { u16 payload_len = nla_len(nfqa[NFQA_PAYLOAD]); int diff = payload_len - entry->skb->len; if (nfqnl_mangle(nla_data(nfqa[NFQA_PAYLOAD]), payload_len, entry, diff) < 0) verdict = NF_DROP; if (ct && diff) nfnl_ct->seq_adjust(entry->skb, ct, ctinfo, diff); } if (nfqa[NFQA_MARK]) entry->skb->mark = ntohl(nla_get_be32(nfqa[NFQA_MARK])); nfqnl_reinject(entry, verdict); return 0; } static int nfqnl_recv_unsupp(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const cda[]) { return -ENOTSUPP; } static const struct nla_policy nfqa_cfg_policy[NFQA_CFG_MAX+1] = { [NFQA_CFG_CMD] = { .len = sizeof(struct nfqnl_msg_config_cmd) }, [NFQA_CFG_PARAMS] = { .len = sizeof(struct nfqnl_msg_config_params) }, [NFQA_CFG_QUEUE_MAXLEN] = { .type = NLA_U32 }, [NFQA_CFG_MASK] = { .type = NLA_U32 }, [NFQA_CFG_FLAGS] = { .type = NLA_U32 }, }; static const struct nf_queue_handler nfqh = { .outfn = nfqnl_enqueue_packet, .nf_hook_drop = nfqnl_nf_hook_drop, }; static int nfqnl_recv_config(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nfqa[]) { struct nfnl_queue_net *q = nfnl_queue_pernet(info->net); u_int16_t queue_num = ntohs(info->nfmsg->res_id); struct nfqnl_msg_config_cmd *cmd = NULL; struct nfqnl_instance *queue; __u32 flags = 0, mask = 0; int ret = 0; if (nfqa[NFQA_CFG_CMD]) { cmd = nla_data(nfqa[NFQA_CFG_CMD]); /* Obsolete commands without queue context */ switch (cmd->command) { case NFQNL_CFG_CMD_PF_BIND: return 0; case NFQNL_CFG_CMD_PF_UNBIND: return 0; } } /* Check if we support these flags in first place, dependencies should * be there too not to break atomicity. */ if (nfqa[NFQA_CFG_FLAGS]) { if (!nfqa[NFQA_CFG_MASK]) { /* A mask is needed to specify which flags are being * changed. 
*/ return -EINVAL; } flags = ntohl(nla_get_be32(nfqa[NFQA_CFG_FLAGS])); mask = ntohl(nla_get_be32(nfqa[NFQA_CFG_MASK])); if (flags >= NFQA_CFG_F_MAX) return -EOPNOTSUPP; #if !IS_ENABLED(CONFIG_NETWORK_SECMARK) if (flags & mask & NFQA_CFG_F_SECCTX) return -EOPNOTSUPP; #endif if ((flags & mask & NFQA_CFG_F_CONNTRACK) && !rcu_access_pointer(nfnl_ct_hook)) { #ifdef CONFIG_MODULES nfnl_unlock(NFNL_SUBSYS_QUEUE); request_module("ip_conntrack_netlink"); nfnl_lock(NFNL_SUBSYS_QUEUE); if (rcu_access_pointer(nfnl_ct_hook)) return -EAGAIN; #endif return -EOPNOTSUPP; } } rcu_read_lock(); queue = instance_lookup(q, queue_num); if (queue && queue->peer_portid != NETLINK_CB(skb).portid) { ret = -EPERM; goto err_out_unlock; } if (cmd != NULL) { switch (cmd->command) { case NFQNL_CFG_CMD_BIND: if (queue) { ret = -EBUSY; goto err_out_unlock; } queue = instance_create(q, queue_num, NETLINK_CB(skb).portid); if (IS_ERR(queue)) { ret = PTR_ERR(queue); goto err_out_unlock; } break; case NFQNL_CFG_CMD_UNBIND: if (!queue) { ret = -ENODEV; goto err_out_unlock; } instance_destroy(q, queue); goto err_out_unlock; case NFQNL_CFG_CMD_PF_BIND: case NFQNL_CFG_CMD_PF_UNBIND: break; default: ret = -ENOTSUPP; goto err_out_unlock; } } if (!queue) { ret = -ENODEV; goto err_out_unlock; } if (nfqa[NFQA_CFG_PARAMS]) { struct nfqnl_msg_config_params *params = nla_data(nfqa[NFQA_CFG_PARAMS]); nfqnl_set_mode(queue, params->copy_mode, ntohl(params->copy_range)); } if (nfqa[NFQA_CFG_QUEUE_MAXLEN]) { __be32 *queue_maxlen = nla_data(nfqa[NFQA_CFG_QUEUE_MAXLEN]); spin_lock_bh(&queue->lock); queue->queue_maxlen = ntohl(*queue_maxlen); spin_unlock_bh(&queue->lock); } if (nfqa[NFQA_CFG_FLAGS]) { spin_lock_bh(&queue->lock); queue->flags &= ~mask; queue->flags |= flags & mask; spin_unlock_bh(&queue->lock); } err_out_unlock: rcu_read_unlock(); return ret; } static const struct nfnl_callback nfqnl_cb[NFQNL_MSG_MAX] = { [NFQNL_MSG_PACKET] = { .call = nfqnl_recv_unsupp, .type = NFNL_CB_RCU, .attr_count = NFQA_MAX, }, [NFQNL_MSG_VERDICT] = { .call = nfqnl_recv_verdict, .type = NFNL_CB_RCU, .attr_count = NFQA_MAX, .policy = nfqa_verdict_policy }, [NFQNL_MSG_CONFIG] = { .call = nfqnl_recv_config, .type = NFNL_CB_MUTEX, .attr_count = NFQA_CFG_MAX, .policy = nfqa_cfg_policy }, [NFQNL_MSG_VERDICT_BATCH] = { .call = nfqnl_recv_verdict_batch, .type = NFNL_CB_RCU, .attr_count = NFQA_MAX, .policy = nfqa_verdict_batch_policy }, }; static const struct nfnetlink_subsystem nfqnl_subsys = { .name = "nf_queue", .subsys_id = NFNL_SUBSYS_QUEUE, .cb_count = NFQNL_MSG_MAX, .cb = nfqnl_cb, }; #ifdef CONFIG_PROC_FS struct iter_state { struct seq_net_private p; unsigned int bucket; }; static struct hlist_node *get_first(struct seq_file *seq) { struct iter_state *st = seq->private; struct net *net; struct nfnl_queue_net *q; if (!st) return NULL; net = seq_file_net(seq); q = nfnl_queue_pernet(net); for (st->bucket = 0; st->bucket < INSTANCE_BUCKETS; st->bucket++) { if (!hlist_empty(&q->instance_table[st->bucket])) return q->instance_table[st->bucket].first; } return NULL; } static struct hlist_node *get_next(struct seq_file *seq, struct hlist_node *h) { struct iter_state *st = seq->private; struct net *net = seq_file_net(seq); h = h->next; while (!h) { struct nfnl_queue_net *q; if (++st->bucket >= INSTANCE_BUCKETS) return NULL; q = nfnl_queue_pernet(net); h = q->instance_table[st->bucket].first; } return h; } static struct hlist_node *get_idx(struct seq_file *seq, loff_t pos) { struct hlist_node *head; head = get_first(seq); if (head) while (pos && (head = get_next(seq, 
head))) pos--; return pos ? NULL : head; } static void *seq_start(struct seq_file *s, loff_t *pos) __acquires(nfnl_queue_pernet(seq_file_net(s))->instances_lock) { spin_lock(&nfnl_queue_pernet(seq_file_net(s))->instances_lock); return get_idx(s, *pos); } static void *seq_next(struct seq_file *s, void *v, loff_t *pos) { (*pos)++; return get_next(s, v); } static void seq_stop(struct seq_file *s, void *v) __releases(nfnl_queue_pernet(seq_file_net(s))->instances_lock) { spin_unlock(&nfnl_queue_pernet(seq_file_net(s))->instances_lock); } static int seq_show(struct seq_file *s, void *v) { const struct nfqnl_instance *inst = v; seq_printf(s, "%5u %6u %5u %1u %5u %5u %5u %8u %2d\n", inst->queue_num, inst->peer_portid, inst->queue_total, inst->copy_mode, inst->copy_range, inst->queue_dropped, inst->queue_user_dropped, inst->id_sequence, 1); return 0; } static const struct seq_operations nfqnl_seq_ops = { .start = seq_start, .next = seq_next, .stop = seq_stop, .show = seq_show, }; #endif /* PROC_FS */ static int __net_init nfnl_queue_net_init(struct net *net) { unsigned int i; struct nfnl_queue_net *q = nfnl_queue_pernet(net); for (i = 0; i < INSTANCE_BUCKETS; i++) INIT_HLIST_HEAD(&q->instance_table[i]); spin_lock_init(&q->instances_lock); #ifdef CONFIG_PROC_FS if (!proc_create_net("nfnetlink_queue", 0440, net->nf.proc_netfilter, &nfqnl_seq_ops, sizeof(struct iter_state))) return -ENOMEM; #endif return 0; } static void __net_exit nfnl_queue_net_exit(struct net *net) { struct nfnl_queue_net *q = nfnl_queue_pernet(net); unsigned int i; #ifdef CONFIG_PROC_FS remove_proc_entry("nfnetlink_queue", net->nf.proc_netfilter); #endif for (i = 0; i < INSTANCE_BUCKETS; i++) WARN_ON_ONCE(!hlist_empty(&q->instance_table[i])); } static void nfnl_queue_net_exit_batch(struct list_head *net_exit_list) { synchronize_rcu(); } static struct pernet_operations nfnl_queue_net_ops = { .init = nfnl_queue_net_init, .exit = nfnl_queue_net_exit, .exit_batch = nfnl_queue_net_exit_batch, .id = &nfnl_queue_net_id, .size = sizeof(struct nfnl_queue_net), }; static int __init nfnetlink_queue_init(void) { int status; status = register_pernet_subsys(&nfnl_queue_net_ops); if (status < 0) { pr_err("failed to register pernet ops\n"); goto out; } netlink_register_notifier(&nfqnl_rtnl_notifier); status = nfnetlink_subsys_register(&nfqnl_subsys); if (status < 0) { pr_err("failed to create netlink socket\n"); goto cleanup_netlink_notifier; } status = register_netdevice_notifier(&nfqnl_dev_notifier); if (status < 0) { pr_err("failed to register netdevice notifier\n"); goto cleanup_netlink_subsys; } nf_register_queue_handler(&nfqh); return status; cleanup_netlink_subsys: nfnetlink_subsys_unregister(&nfqnl_subsys); cleanup_netlink_notifier: netlink_unregister_notifier(&nfqnl_rtnl_notifier); unregister_pernet_subsys(&nfnl_queue_net_ops); out: return status; } static void __exit nfnetlink_queue_fini(void) { nf_unregister_queue_handler(); unregister_netdevice_notifier(&nfqnl_dev_notifier); nfnetlink_subsys_unregister(&nfqnl_subsys); netlink_unregister_notifier(&nfqnl_rtnl_notifier); unregister_pernet_subsys(&nfnl_queue_net_ops); rcu_barrier(); /* Wait for completion of call_rcu()'s */ } MODULE_DESCRIPTION("netfilter packet queue handler"); MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); MODULE_LICENSE("GPL"); MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_QUEUE); module_init(nfnetlink_queue_init); module_exit(nfnetlink_queue_fini);
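nfqnl_recv_verdict_batch() decides which queued packets a batch verdict covers with nfq_id_after(), which relies on unsigned subtraction cast to a signed int so the comparison stays correct even when the 32-bit id_sequence wraps. The standalone sketch below demonstrates that property; it assumes the usual two's-complement behaviour of the int conversion, as the kernel build does.

/*
 * Standalone demonstration of the wraparound-safe id comparison used by
 * nfq_id_after().  Not part of the kernel build.
 */
#include <limits.h>
#include <stdio.h>

static int nfq_id_after(unsigned int id, unsigned int max)
{
	/* small positive difference => id is "after" max, even across wrap */
	return (int)(id - max) > 0;
}

int main(void)
{
	/* ordinary case: 10 is after 5 */
	printf("%d\n", nfq_id_after(10, 5));		/* prints 1 */
	/* wraparound case: id 3 is "after" UINT_MAX - 2 */
	printf("%d\n", nfq_id_after(3, UINT_MAX - 2));	/* prints 1 */
	/* and not the other way round */
	printf("%d\n", nfq_id_after(UINT_MAX - 2, 3));	/* prints 0 */
	return 0;
}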
// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2010-2013 Felix Fietkau <nbd@openwrt.org> * Copyright (C) 2019-2020 Intel Corporation */ #include <linux/netdevice.h> #include <linux/types.h> #include <linux/skbuff.h> #include <linux/debugfs.h> #include <linux/random.h> #include <linux/moduleparam.h> #include <linux/ieee80211.h> #include <net/mac80211.h> #include "rate.h" #include "sta_info.h" #include "rc80211_minstrel_ht.h" #define AVG_AMPDU_SIZE 16 #define AVG_PKT_SIZE 1200 #define SAMPLE_SWITCH_THR 100 /* Number of bits for an average sized packet */ #define MCS_NBITS ((AVG_PKT_SIZE * AVG_AMPDU_SIZE) << 3) /* Number of symbols for a packet with (bps) bits per symbol */ #define MCS_NSYMS(bps) DIV_ROUND_UP(MCS_NBITS, (bps)) /* Transmission time (nanoseconds) for a packet containing (syms) symbols */ #define MCS_SYMBOL_TIME(sgi, syms) \ (sgi ? \ ((syms) * 18000 + 4000) / 5 : /* syms * 3.6 us */ \ ((syms) * 1000) << 2 /* syms * 4 us */ \ ) /* Transmit duration for the raw data part of an average sized packet */ #define MCS_DURATION(streams, sgi, bps) \ (MCS_SYMBOL_TIME(sgi, MCS_NSYMS((streams) * (bps))) / AVG_AMPDU_SIZE) #define BW_20 0 #define BW_40 1 #define BW_80 2 /* * Define group sort order: HT40 -> SGI -> #streams */ #define GROUP_IDX(_streams, _sgi, _ht40) \ MINSTREL_HT_GROUP_0 + \ MINSTREL_MAX_STREAMS * 2 * _ht40 + \ MINSTREL_MAX_STREAMS * _sgi + \ _streams - 1 #define _MAX(a, b) (((a)>(b))?(a):(b)) #define GROUP_SHIFT(duration) \ _MAX(0, 16 - __builtin_clz(duration)) /* MCS rate information for an MCS group */ #define __MCS_GROUP(_streams, _sgi, _ht40, _s) \ [GROUP_IDX(_streams, _sgi, _ht40)] = { \ .streams = _streams, \ .shift = _s, \ .bw = _ht40, \ .flags = \ IEEE80211_TX_RC_MCS | \ (_sgi ? IEEE80211_TX_RC_SHORT_GI : 0) | \ (_ht40 ? IEEE80211_TX_RC_40_MHZ_WIDTH : 0), \ .duration = { \ MCS_DURATION(_streams, _sgi, _ht40 ? 54 : 26) >> _s, \ MCS_DURATION(_streams, _sgi, _ht40 ? 108 : 52) >> _s, \ MCS_DURATION(_streams, _sgi, _ht40 ?
162 : 78) >> _s, \ MCS_DURATION(_streams, _sgi, _ht40 ? 216 : 104) >> _s, \ MCS_DURATION(_streams, _sgi, _ht40 ? 324 : 156) >> _s, \ MCS_DURATION(_streams, _sgi, _ht40 ? 432 : 208) >> _s, \ MCS_DURATION(_streams, _sgi, _ht40 ? 486 : 234) >> _s, \ MCS_DURATION(_streams, _sgi, _ht40 ? 540 : 260) >> _s \ } \ } #define MCS_GROUP_SHIFT(_streams, _sgi, _ht40) \ GROUP_SHIFT(MCS_DURATION(_streams, _sgi, _ht40 ? 54 : 26)) #define MCS_GROUP(_streams, _sgi, _ht40) \ __MCS_GROUP(_streams, _sgi, _ht40, \ MCS_GROUP_SHIFT(_streams, _sgi, _ht40)) #define VHT_GROUP_IDX(_streams, _sgi, _bw) \ (MINSTREL_VHT_GROUP_0 + \ MINSTREL_MAX_STREAMS * 2 * (_bw) + \ MINSTREL_MAX_STREAMS * (_sgi) + \ (_streams) - 1) #define BW2VBPS(_bw, r3, r2, r1) \ (_bw == BW_80 ? r3 : _bw == BW_40 ? r2 : r1) #define __VHT_GROUP(_streams, _sgi, _bw, _s) \ [VHT_GROUP_IDX(_streams, _sgi, _bw)] = { \ .streams = _streams, \ .shift = _s, \ .bw = _bw, \ .flags = \ IEEE80211_TX_RC_VHT_MCS | \ (_sgi ? IEEE80211_TX_RC_SHORT_GI : 0) | \ (_bw == BW_80 ? IEEE80211_TX_RC_80_MHZ_WIDTH : \ _bw == BW_40 ? IEEE80211_TX_RC_40_MHZ_WIDTH : 0), \ .duration = { \ MCS_DURATION(_streams, _sgi, \ BW2VBPS(_bw, 117, 54, 26)) >> _s, \ MCS_DURATION(_streams, _sgi, \ BW2VBPS(_bw, 234, 108, 52)) >> _s, \ MCS_DURATION(_streams, _sgi, \ BW2VBPS(_bw, 351, 162, 78)) >> _s, \ MCS_DURATION(_streams, _sgi, \ BW2VBPS(_bw, 468, 216, 104)) >> _s, \ MCS_DURATION(_streams, _sgi, \ BW2VBPS(_bw, 702, 324, 156)) >> _s, \ MCS_DURATION(_streams, _sgi, \ BW2VBPS(_bw, 936, 432, 208)) >> _s, \ MCS_DURATION(_streams, _sgi, \ BW2VBPS(_bw, 1053, 486, 234)) >> _s, \ MCS_DURATION(_streams, _sgi, \ BW2VBPS(_bw, 1170, 540, 260)) >> _s, \ MCS_DURATION(_streams, _sgi, \ BW2VBPS(_bw, 1404, 648, 312)) >> _s, \ MCS_DURATION(_streams, _sgi, \ BW2VBPS(_bw, 1560, 720, 346)) >> _s \ } \ } #define VHT_GROUP_SHIFT(_streams, _sgi, _bw) \ GROUP_SHIFT(MCS_DURATION(_streams, _sgi, \ BW2VBPS(_bw, 117, 54, 26))) #define VHT_GROUP(_streams, _sgi, _bw) \ __VHT_GROUP(_streams, _sgi, _bw, \ VHT_GROUP_SHIFT(_streams, _sgi, _bw)) #define CCK_DURATION(_bitrate, _short) \ (1000 * (10 /* SIFS */ + \ (_short ? 
72 + 24 : 144 + 48) + \ (8 * (AVG_PKT_SIZE + 4) * 10) / (_bitrate))) #define CCK_DURATION_LIST(_short, _s) \ CCK_DURATION(10, _short) >> _s, \ CCK_DURATION(20, _short) >> _s, \ CCK_DURATION(55, _short) >> _s, \ CCK_DURATION(110, _short) >> _s #define __CCK_GROUP(_s) \ [MINSTREL_CCK_GROUP] = { \ .streams = 1, \ .flags = 0, \ .shift = _s, \ .duration = { \ CCK_DURATION_LIST(false, _s), \ CCK_DURATION_LIST(true, _s) \ } \ } #define CCK_GROUP_SHIFT \ GROUP_SHIFT(CCK_DURATION(10, false)) #define CCK_GROUP __CCK_GROUP(CCK_GROUP_SHIFT) #define OFDM_DURATION(_bitrate) \ (1000 * (16 /* SIFS + signal ext */ + \ 16 /* T_PREAMBLE */ + \ 4 /* T_SIGNAL */ + \ 4 * (((16 + 80 * (AVG_PKT_SIZE + 4) + 6) / \ ((_bitrate) * 4))))) #define OFDM_DURATION_LIST(_s) \ OFDM_DURATION(60) >> _s, \ OFDM_DURATION(90) >> _s, \ OFDM_DURATION(120) >> _s, \ OFDM_DURATION(180) >> _s, \ OFDM_DURATION(240) >> _s, \ OFDM_DURATION(360) >> _s, \ OFDM_DURATION(480) >> _s, \ OFDM_DURATION(540) >> _s #define __OFDM_GROUP(_s) \ [MINSTREL_OFDM_GROUP] = { \ .streams = 1, \ .flags = 0, \ .shift = _s, \ .duration = { \ OFDM_DURATION_LIST(_s), \ } \ } #define OFDM_GROUP_SHIFT \ GROUP_SHIFT(OFDM_DURATION(60)) #define OFDM_GROUP __OFDM_GROUP(OFDM_GROUP_SHIFT) static bool minstrel_vht_only = true; module_param(minstrel_vht_only, bool, 0644); MODULE_PARM_DESC(minstrel_vht_only, "Use only VHT rates when VHT is supported by sta."); /* * To enable sufficiently targeted rate sampling, MCS rates are divided into * groups, based on the number of streams and flags (HT40, SGI) that they * use. * * Sortorder has to be fixed for GROUP_IDX macro to be applicable: * BW -> SGI -> #streams */ const struct mcs_group minstrel_mcs_groups[] = { MCS_GROUP(1, 0, BW_20), MCS_GROUP(2, 0, BW_20), MCS_GROUP(3, 0, BW_20), MCS_GROUP(4, 0, BW_20), MCS_GROUP(1, 1, BW_20), MCS_GROUP(2, 1, BW_20), MCS_GROUP(3, 1, BW_20), MCS_GROUP(4, 1, BW_20), MCS_GROUP(1, 0, BW_40), MCS_GROUP(2, 0, BW_40), MCS_GROUP(3, 0, BW_40), MCS_GROUP(4, 0, BW_40), MCS_GROUP(1, 1, BW_40), MCS_GROUP(2, 1, BW_40), MCS_GROUP(3, 1, BW_40), MCS_GROUP(4, 1, BW_40), CCK_GROUP, OFDM_GROUP, VHT_GROUP(1, 0, BW_20), VHT_GROUP(2, 0, BW_20), VHT_GROUP(3, 0, BW_20), VHT_GROUP(4, 0, BW_20), VHT_GROUP(1, 1, BW_20), VHT_GROUP(2, 1, BW_20), VHT_GROUP(3, 1, BW_20), VHT_GROUP(4, 1, BW_20), VHT_GROUP(1, 0, BW_40), VHT_GROUP(2, 0, BW_40), VHT_GROUP(3, 0, BW_40), VHT_GROUP(4, 0, BW_40), VHT_GROUP(1, 1, BW_40), VHT_GROUP(2, 1, BW_40), VHT_GROUP(3, 1, BW_40), VHT_GROUP(4, 1, BW_40), VHT_GROUP(1, 0, BW_80), VHT_GROUP(2, 0, BW_80), VHT_GROUP(3, 0, BW_80), VHT_GROUP(4, 0, BW_80), VHT_GROUP(1, 1, BW_80), VHT_GROUP(2, 1, BW_80), VHT_GROUP(3, 1, BW_80), VHT_GROUP(4, 1, BW_80), }; const s16 minstrel_cck_bitrates[4] = { 10, 20, 55, 110 }; const s16 minstrel_ofdm_bitrates[8] = { 60, 90, 120, 180, 240, 360, 480, 540 }; static u8 sample_table[SAMPLE_COLUMNS][MCS_GROUP_RATES] __read_mostly; static const u8 minstrel_sample_seq[] = { MINSTREL_SAMPLE_TYPE_INC, MINSTREL_SAMPLE_TYPE_JUMP, MINSTREL_SAMPLE_TYPE_INC, MINSTREL_SAMPLE_TYPE_JUMP, MINSTREL_SAMPLE_TYPE_INC, MINSTREL_SAMPLE_TYPE_SLOW, }; static void minstrel_ht_update_rates(struct minstrel_priv *mp, struct minstrel_ht_sta *mi); /* * Some VHT MCSes are invalid (when Ndbps / Nes is not an integer) * e.g for MCS9@20MHzx1Nss: Ndbps=8x52*(5/6) Nes=1 * * Returns the valid mcs map for struct minstrel_mcs_group_data.supported */ static u16 minstrel_get_valid_vht_rates(int bw, int nss, __le16 mcs_map) { u16 mask = 0; if (bw == BW_20) { if (nss != 3 && nss != 6) mask = BIT(9); } else if (bw 
== BW_80) { if (nss == 3 || nss == 7) mask = BIT(6); else if (nss == 6) mask = BIT(9); } else { WARN_ON(bw != BW_40); } switch ((le16_to_cpu(mcs_map) >> (2 * (nss - 1))) & 3) { case IEEE80211_VHT_MCS_SUPPORT_0_7: mask |= 0x300; break; case IEEE80211_VHT_MCS_SUPPORT_0_8: mask |= 0x200; break; case IEEE80211_VHT_MCS_SUPPORT_0_9: break; default: mask = 0x3ff; } return 0x3ff & ~mask; } static bool minstrel_ht_is_legacy_group(int group) { return group == MINSTREL_CCK_GROUP || group == MINSTREL_OFDM_GROUP; } /* * Look up an MCS group index based on mac80211 rate information */ static int minstrel_ht_get_group_idx(struct ieee80211_tx_rate *rate) { return GROUP_IDX((rate->idx / 8) + 1, !!(rate->flags & IEEE80211_TX_RC_SHORT_GI), !!(rate->flags & IEEE80211_TX_RC_40_MHZ_WIDTH)); } static int minstrel_vht_get_group_idx(struct ieee80211_tx_rate *rate) { return VHT_GROUP_IDX(ieee80211_rate_get_vht_nss(rate), !!(rate->flags & IEEE80211_TX_RC_SHORT_GI), !!(rate->flags & IEEE80211_TX_RC_40_MHZ_WIDTH) + 2*!!(rate->flags & IEEE80211_TX_RC_80_MHZ_WIDTH)); } static struct minstrel_rate_stats * minstrel_ht_get_stats(struct minstrel_priv *mp, struct minstrel_ht_sta *mi, struct ieee80211_tx_rate *rate) { int group, idx; if (rate->flags & IEEE80211_TX_RC_MCS) { group = minstrel_ht_get_group_idx(rate); idx = rate->idx % 8; goto out; } if (rate->flags & IEEE80211_TX_RC_VHT_MCS) { group = minstrel_vht_get_group_idx(rate); idx = ieee80211_rate_get_vht_mcs(rate); goto out; } group = MINSTREL_CCK_GROUP; for (idx = 0; idx < ARRAY_SIZE(mp->cck_rates); idx++) { if (!(mi->supported[group] & BIT(idx))) continue; if (rate->idx != mp->cck_rates[idx]) continue; /* short preamble */ if ((mi->supported[group] & BIT(idx + 4)) && (rate->flags & IEEE80211_TX_RC_USE_SHORT_PREAMBLE)) idx += 4; goto out; } group = MINSTREL_OFDM_GROUP; for (idx = 0; idx < ARRAY_SIZE(mp->ofdm_rates[0]); idx++) if (rate->idx == mp->ofdm_rates[mi->band][idx]) goto out; idx = 0; out: return &mi->groups[group].rates[idx]; } static inline struct minstrel_rate_stats * minstrel_get_ratestats(struct minstrel_ht_sta *mi, int index) { return &mi->groups[MI_RATE_GROUP(index)].rates[MI_RATE_IDX(index)]; } static inline int minstrel_get_duration(int index) { const struct mcs_group *group = &minstrel_mcs_groups[MI_RATE_GROUP(index)]; unsigned int duration = group->duration[MI_RATE_IDX(index)]; return duration << group->shift; } static unsigned int minstrel_ht_avg_ampdu_len(struct minstrel_ht_sta *mi) { int duration; if (mi->avg_ampdu_len) return MINSTREL_TRUNC(mi->avg_ampdu_len); if (minstrel_ht_is_legacy_group(MI_RATE_GROUP(mi->max_tp_rate[0]))) return 1; duration = minstrel_get_duration(mi->max_tp_rate[0]); if (duration > 400 * 1000) return 2; if (duration > 250 * 1000) return 4; if (duration > 150 * 1000) return 8; return 16; } /* * Return current throughput based on the average A-MPDU length, taking into * account the expected number of retransmissions and their expected length */ int minstrel_ht_get_tp_avg(struct minstrel_ht_sta *mi, int group, int rate, int prob_avg) { unsigned int nsecs = 0, overhead = mi->overhead; unsigned int ampdu_len = 1; /* do not account throughput if success prob is below 10% */ if (prob_avg < MINSTREL_FRAC(10, 100)) return 0; if (minstrel_ht_is_legacy_group(group)) overhead = mi->overhead_legacy; else ampdu_len = minstrel_ht_avg_ampdu_len(mi); nsecs = 1000 * overhead / ampdu_len; nsecs += minstrel_mcs_groups[group].duration[rate] << minstrel_mcs_groups[group].shift; /* * For the throughput calculation, limit the probability value to 
90% to * account for collision related packet error rate fluctuation * (prob is scaled - see MINSTREL_FRAC above) */ if (prob_avg > MINSTREL_FRAC(90, 100)) prob_avg = MINSTREL_FRAC(90, 100); return MINSTREL_TRUNC(100 * ((prob_avg * 1000000) / nsecs)); } /* * Find & sort topmost throughput rates * * If multiple rates provide equal throughput the sorting is based on their * current success probability. Higher success probability is preferred among * MCS groups, CCK rates do not provide aggregation and are therefore at last. */ static void minstrel_ht_sort_best_tp_rates(struct minstrel_ht_sta *mi, u16 index, u16 *tp_list) { int cur_group, cur_idx, cur_tp_avg, cur_prob; int tmp_group, tmp_idx, tmp_tp_avg, tmp_prob; int j = MAX_THR_RATES; cur_group = MI_RATE_GROUP(index); cur_idx = MI_RATE_IDX(index); cur_prob = mi->groups[cur_group].rates[cur_idx].prob_avg; cur_tp_avg = minstrel_ht_get_tp_avg(mi, cur_group, cur_idx, cur_prob); do { tmp_group = MI_RATE_GROUP(tp_list[j - 1]); tmp_idx = MI_RATE_IDX(tp_list[j - 1]); tmp_prob = mi->groups[tmp_group].rates[tmp_idx].prob_avg; tmp_tp_avg = minstrel_ht_get_tp_avg(mi, tmp_group, tmp_idx, tmp_prob); if (cur_tp_avg < tmp_tp_avg || (cur_tp_avg == tmp_tp_avg && cur_prob <= tmp_prob)) break; j--; } while (j > 0); if (j < MAX_THR_RATES - 1) { memmove(&tp_list[j + 1], &tp_list[j], (sizeof(*tp_list) * (MAX_THR_RATES - (j + 1)))); } if (j < MAX_THR_RATES) tp_list[j] = index; } /* * Find and set the topmost probability rate per sta and per group */ static void minstrel_ht_set_best_prob_rate(struct minstrel_ht_sta *mi, u16 *dest, u16 index) { struct minstrel_mcs_group_data *mg; struct minstrel_rate_stats *mrs; int tmp_group, tmp_idx, tmp_tp_avg, tmp_prob; int max_tp_group, max_tp_idx, max_tp_prob; int cur_tp_avg, cur_group, cur_idx; int max_gpr_group, max_gpr_idx; int max_gpr_tp_avg, max_gpr_prob; cur_group = MI_RATE_GROUP(index); cur_idx = MI_RATE_IDX(index); mg = &mi->groups[cur_group]; mrs = &mg->rates[cur_idx]; tmp_group = MI_RATE_GROUP(*dest); tmp_idx = MI_RATE_IDX(*dest); tmp_prob = mi->groups[tmp_group].rates[tmp_idx].prob_avg; tmp_tp_avg = minstrel_ht_get_tp_avg(mi, tmp_group, tmp_idx, tmp_prob); /* if max_tp_rate[0] is from MCS_GROUP max_prob_rate get selected from * MCS_GROUP as well as CCK_GROUP rates do not allow aggregation */ max_tp_group = MI_RATE_GROUP(mi->max_tp_rate[0]); max_tp_idx = MI_RATE_IDX(mi->max_tp_rate[0]); max_tp_prob = mi->groups[max_tp_group].rates[max_tp_idx].prob_avg; if (minstrel_ht_is_legacy_group(MI_RATE_GROUP(index)) && !minstrel_ht_is_legacy_group(max_tp_group)) return; /* skip rates faster than max tp rate with lower prob */ if (minstrel_get_duration(mi->max_tp_rate[0]) > minstrel_get_duration(index) && mrs->prob_avg < max_tp_prob) return; max_gpr_group = MI_RATE_GROUP(mg->max_group_prob_rate); max_gpr_idx = MI_RATE_IDX(mg->max_group_prob_rate); max_gpr_prob = mi->groups[max_gpr_group].rates[max_gpr_idx].prob_avg; if (mrs->prob_avg > MINSTREL_FRAC(75, 100)) { cur_tp_avg = minstrel_ht_get_tp_avg(mi, cur_group, cur_idx, mrs->prob_avg); if (cur_tp_avg > tmp_tp_avg) *dest = index; max_gpr_tp_avg = minstrel_ht_get_tp_avg(mi, max_gpr_group, max_gpr_idx, max_gpr_prob); if (cur_tp_avg > max_gpr_tp_avg) mg->max_group_prob_rate = index; } else { if (mrs->prob_avg > tmp_prob) *dest = index; if (mrs->prob_avg > max_gpr_prob) mg->max_group_prob_rate = index; } } /* * Assign new rate set per sta and use CCK rates only if the fastest * rate (max_tp_rate[0]) is from CCK group. 
This prohibits such sorted * rate sets where MCS and CCK rates are mixed, because CCK rates can * not use aggregation. */ static void minstrel_ht_assign_best_tp_rates(struct minstrel_ht_sta *mi, u16 tmp_mcs_tp_rate[MAX_THR_RATES], u16 tmp_legacy_tp_rate[MAX_THR_RATES]) { unsigned int tmp_group, tmp_idx, tmp_cck_tp, tmp_mcs_tp, tmp_prob; int i; tmp_group = MI_RATE_GROUP(tmp_legacy_tp_rate[0]); tmp_idx = MI_RATE_IDX(tmp_legacy_tp_rate[0]); tmp_prob = mi->groups[tmp_group].rates[tmp_idx].prob_avg; tmp_cck_tp = minstrel_ht_get_tp_avg(mi, tmp_group, tmp_idx, tmp_prob); tmp_group = MI_RATE_GROUP(tmp_mcs_tp_rate[0]); tmp_idx = MI_RATE_IDX(tmp_mcs_tp_rate[0]); tmp_prob = mi->groups[tmp_group].rates[tmp_idx].prob_avg; tmp_mcs_tp = minstrel_ht_get_tp_avg(mi, tmp_group, tmp_idx, tmp_prob); if (tmp_cck_tp > tmp_mcs_tp) { for(i = 0; i < MAX_THR_RATES; i++) { minstrel_ht_sort_best_tp_rates(mi, tmp_legacy_tp_rate[i], tmp_mcs_tp_rate); } } } /* * Try to increase robustness of max_prob rate by decrease number of * streams if possible. */ static inline void minstrel_ht_prob_rate_reduce_streams(struct minstrel_ht_sta *mi) { struct minstrel_mcs_group_data *mg; int tmp_max_streams, group, tmp_idx, tmp_prob; int tmp_tp = 0; if (!mi->sta->ht_cap.ht_supported) return; group = MI_RATE_GROUP(mi->max_tp_rate[0]); tmp_max_streams = minstrel_mcs_groups[group].streams; for (group = 0; group < ARRAY_SIZE(minstrel_mcs_groups); group++) { mg = &mi->groups[group]; if (!mi->supported[group] || group == MINSTREL_CCK_GROUP) continue; tmp_idx = MI_RATE_IDX(mg->max_group_prob_rate); tmp_prob = mi->groups[group].rates[tmp_idx].prob_avg; if (tmp_tp < minstrel_ht_get_tp_avg(mi, group, tmp_idx, tmp_prob) && (minstrel_mcs_groups[group].streams < tmp_max_streams)) { mi->max_prob_rate = mg->max_group_prob_rate; tmp_tp = minstrel_ht_get_tp_avg(mi, group, tmp_idx, tmp_prob); } } } static u16 __minstrel_ht_get_sample_rate(struct minstrel_ht_sta *mi, enum minstrel_sample_type type) { u16 *rates = mi->sample[type].sample_rates; u16 cur; int i; for (i = 0; i < MINSTREL_SAMPLE_RATES; i++) { if (!rates[i]) continue; cur = rates[i]; rates[i] = 0; return cur; } return 0; } static inline int minstrel_ewma(int old, int new, int weight) { int diff, incr; diff = new - old; incr = (EWMA_DIV - weight) * diff / EWMA_DIV; return old + incr; } static inline int minstrel_filter_avg_add(u16 *prev_1, u16 *prev_2, s32 in) { s32 out_1 = *prev_1; s32 out_2 = *prev_2; s32 val; if (!in) in += 1; if (!out_1) { val = out_1 = in; goto out; } val = MINSTREL_AVG_COEFF1 * in; val += MINSTREL_AVG_COEFF2 * out_1; val += MINSTREL_AVG_COEFF3 * out_2; val >>= MINSTREL_SCALE; if (val > 1 << MINSTREL_SCALE) val = 1 << MINSTREL_SCALE; if (val < 0) val = 1; out: *prev_2 = out_1; *prev_1 = val; return val; } /* * Recalculate statistics and counters of a given rate */ static void minstrel_ht_calc_rate_stats(struct minstrel_priv *mp, struct minstrel_rate_stats *mrs) { unsigned int cur_prob; if (unlikely(mrs->attempts > 0)) { cur_prob = MINSTREL_FRAC(mrs->success, mrs->attempts); minstrel_filter_avg_add(&mrs->prob_avg, &mrs->prob_avg_1, cur_prob); mrs->att_hist += mrs->attempts; mrs->succ_hist += mrs->success; } mrs->last_success = mrs->success; mrs->last_attempts = mrs->attempts; mrs->success = 0; mrs->attempts = 0; } static bool minstrel_ht_find_sample_rate(struct minstrel_ht_sta *mi, int type, int idx) { int i; for (i = 0; i < MINSTREL_SAMPLE_RATES; i++) { u16 cur = mi->sample[type].sample_rates[i]; if (cur == idx) return true; if (!cur) break; } return false; } static int 
minstrel_ht_move_sample_rates(struct minstrel_ht_sta *mi, int type, u32 fast_rate_dur, u32 slow_rate_dur) { u16 *rates = mi->sample[type].sample_rates; int i, j; for (i = 0, j = 0; i < MINSTREL_SAMPLE_RATES; i++) { u32 duration; bool valid = false; u16 cur; cur = rates[i]; if (!cur) continue; duration = minstrel_get_duration(cur); switch (type) { case MINSTREL_SAMPLE_TYPE_SLOW: valid = duration > fast_rate_dur && duration < slow_rate_dur; break; case MINSTREL_SAMPLE_TYPE_INC: case MINSTREL_SAMPLE_TYPE_JUMP: valid = duration < fast_rate_dur; break; default: valid = false; break; } if (!valid) { rates[i] = 0; continue; } if (i == j) continue; rates[j++] = cur; rates[i] = 0; } return j; } static int minstrel_ht_group_min_rate_offset(struct minstrel_ht_sta *mi, int group, u32 max_duration) { u16 supported = mi->supported[group]; int i; for (i = 0; i < MCS_GROUP_RATES && supported; i++, supported >>= 1) { if (!(supported & BIT(0))) continue; if (minstrel_get_duration(MI_RATE(group, i)) >= max_duration) continue; return i; } return -1; } /* * Incremental update rates: * Flip through groups and pick the first group rate that is faster than the * highest currently selected rate */ static u16 minstrel_ht_next_inc_rate(struct minstrel_ht_sta *mi, u32 fast_rate_dur) { u8 type = MINSTREL_SAMPLE_TYPE_INC; int i, index = 0; u8 group; group = mi->sample[type].sample_group; for (i = 0; i < ARRAY_SIZE(minstrel_mcs_groups); i++) { group = (group + 1) % ARRAY_SIZE(minstrel_mcs_groups); index = minstrel_ht_group_min_rate_offset(mi, group, fast_rate_dur); if (index < 0) continue; index = MI_RATE(group, index & 0xf); if (!minstrel_ht_find_sample_rate(mi, type, index)) goto out; } index = 0; out: mi->sample[type].sample_group = group; return index; } static int minstrel_ht_next_group_sample_rate(struct minstrel_ht_sta *mi, int group, u16 supported, int offset) { struct minstrel_mcs_group_data *mg = &mi->groups[group]; u16 idx; int i; for (i = 0; i < MCS_GROUP_RATES; i++) { idx = sample_table[mg->column][mg->index]; if (++mg->index >= MCS_GROUP_RATES) { mg->index = 0; if (++mg->column >= ARRAY_SIZE(sample_table)) mg->column = 0; } if (idx < offset) continue; if (!(supported & BIT(idx))) continue; return MI_RATE(group, idx); } return -1; } /* * Jump rates: * Sample random rates, use those that are faster than the highest * currently selected rate. 
Rates between the fastest and the slowest * get sorted into the slow sample bucket, but only if it has room */ static u16 minstrel_ht_next_jump_rate(struct minstrel_ht_sta *mi, u32 fast_rate_dur, u32 slow_rate_dur, int *slow_rate_ofs) { struct minstrel_rate_stats *mrs; u32 max_duration = slow_rate_dur; int i, index, offset; u16 *slow_rates; u16 supported; u32 duration; u8 group; if (*slow_rate_ofs >= MINSTREL_SAMPLE_RATES) max_duration = fast_rate_dur; slow_rates = mi->sample[MINSTREL_SAMPLE_TYPE_SLOW].sample_rates; group = mi->sample[MINSTREL_SAMPLE_TYPE_JUMP].sample_group; for (i = 0; i < ARRAY_SIZE(minstrel_mcs_groups); i++) { u8 type; group = (group + 1) % ARRAY_SIZE(minstrel_mcs_groups); supported = mi->supported[group]; if (!supported) continue; offset = minstrel_ht_group_min_rate_offset(mi, group, max_duration); if (offset < 0) continue; index = minstrel_ht_next_group_sample_rate(mi, group, supported, offset); if (index < 0) continue; duration = minstrel_get_duration(index); if (duration < fast_rate_dur) type = MINSTREL_SAMPLE_TYPE_JUMP; else type = MINSTREL_SAMPLE_TYPE_SLOW; if (minstrel_ht_find_sample_rate(mi, type, index)) continue; if (type == MINSTREL_SAMPLE_TYPE_JUMP) goto found; if (*slow_rate_ofs >= MINSTREL_SAMPLE_RATES) continue; if (duration >= slow_rate_dur) continue; /* skip slow rates with high success probability */ mrs = minstrel_get_ratestats(mi, index); if (mrs->prob_avg > MINSTREL_FRAC(95, 100)) continue; slow_rates[(*slow_rate_ofs)++] = index; if (*slow_rate_ofs >= MINSTREL_SAMPLE_RATES) max_duration = fast_rate_dur; } index = 0; found: mi->sample[MINSTREL_SAMPLE_TYPE_JUMP].sample_group = group; return index; } static void minstrel_ht_refill_sample_rates(struct minstrel_ht_sta *mi) { u32 prob_dur = minstrel_get_duration(mi->max_prob_rate); u32 tp_dur = minstrel_get_duration(mi->max_tp_rate[0]); u32 tp2_dur = minstrel_get_duration(mi->max_tp_rate[1]); u32 fast_rate_dur = min(min(tp_dur, tp2_dur), prob_dur); u32 slow_rate_dur = max(max(tp_dur, tp2_dur), prob_dur); u16 *rates; int i, j; rates = mi->sample[MINSTREL_SAMPLE_TYPE_INC].sample_rates; i = minstrel_ht_move_sample_rates(mi, MINSTREL_SAMPLE_TYPE_INC, fast_rate_dur, slow_rate_dur); while (i < MINSTREL_SAMPLE_RATES) { rates[i] = minstrel_ht_next_inc_rate(mi, tp_dur); if (!rates[i]) break; i++; } rates = mi->sample[MINSTREL_SAMPLE_TYPE_JUMP].sample_rates; i = minstrel_ht_move_sample_rates(mi, MINSTREL_SAMPLE_TYPE_JUMP, fast_rate_dur, slow_rate_dur); j = minstrel_ht_move_sample_rates(mi, MINSTREL_SAMPLE_TYPE_SLOW, fast_rate_dur, slow_rate_dur); while (i < MINSTREL_SAMPLE_RATES) { rates[i] = minstrel_ht_next_jump_rate(mi, fast_rate_dur, slow_rate_dur, &j); if (!rates[i]) break; i++; } for (i = 0; i < ARRAY_SIZE(mi->sample); i++) memcpy(mi->sample[i].cur_sample_rates, mi->sample[i].sample_rates, sizeof(mi->sample[i].cur_sample_rates)); } /* * Update rate statistics and select new primary rates * * Rules for rate selection: * - max_prob_rate must use only one stream, as a tradeoff between delivery * probability and throughput during strong fluctuations * - as long as the max prob rate has a probability of more than 75%, pick * higher throughput rates, even if the probablity is a bit lower */ static void minstrel_ht_update_stats(struct minstrel_priv *mp, struct minstrel_ht_sta *mi) { struct minstrel_mcs_group_data *mg; struct minstrel_rate_stats *mrs; int group, i, j, cur_prob; u16 tmp_mcs_tp_rate[MAX_THR_RATES], tmp_group_tp_rate[MAX_THR_RATES]; u16 tmp_legacy_tp_rate[MAX_THR_RATES], tmp_max_prob_rate; u16 index; 
bool ht_supported = mi->sta->ht_cap.ht_supported; if (mi->ampdu_packets > 0) { if (!ieee80211_hw_check(mp->hw, TX_STATUS_NO_AMPDU_LEN)) mi->avg_ampdu_len = minstrel_ewma(mi->avg_ampdu_len, MINSTREL_FRAC(mi->ampdu_len, mi->ampdu_packets), EWMA_LEVEL); else mi->avg_ampdu_len = 0; mi->ampdu_len = 0; mi->ampdu_packets = 0; } if (mi->supported[MINSTREL_CCK_GROUP]) group = MINSTREL_CCK_GROUP; else if (mi->supported[MINSTREL_OFDM_GROUP]) group = MINSTREL_OFDM_GROUP; else group = 0; index = MI_RATE(group, 0); for (j = 0; j < ARRAY_SIZE(tmp_legacy_tp_rate); j++) tmp_legacy_tp_rate[j] = index; if (mi->supported[MINSTREL_VHT_GROUP_0]) group = MINSTREL_VHT_GROUP_0; else if (ht_supported) group = MINSTREL_HT_GROUP_0; else if (mi->supported[MINSTREL_CCK_GROUP]) group = MINSTREL_CCK_GROUP; else group = MINSTREL_OFDM_GROUP; index = MI_RATE(group, 0); tmp_max_prob_rate = index; for (j = 0; j < ARRAY_SIZE(tmp_mcs_tp_rate); j++) tmp_mcs_tp_rate[j] = index; /* Find best rate sets within all MCS groups*/ for (group = 0; group < ARRAY_SIZE(minstrel_mcs_groups); group++) { u16 *tp_rate = tmp_mcs_tp_rate; u16 last_prob = 0; mg = &mi->groups[group]; if (!mi->supported[group]) continue; /* (re)Initialize group rate indexes */ for(j = 0; j < MAX_THR_RATES; j++) tmp_group_tp_rate[j] = MI_RATE(group, 0); if (group == MINSTREL_CCK_GROUP && ht_supported) tp_rate = tmp_legacy_tp_rate; for (i = MCS_GROUP_RATES - 1; i >= 0; i--) { if (!(mi->supported[group] & BIT(i))) continue; index = MI_RATE(group, i); mrs = &mg->rates[i]; mrs->retry_updated = false; minstrel_ht_calc_rate_stats(mp, mrs); if (mrs->att_hist) last_prob = max(last_prob, mrs->prob_avg); else mrs->prob_avg = max(last_prob, mrs->prob_avg); cur_prob = mrs->prob_avg; if (minstrel_ht_get_tp_avg(mi, group, i, cur_prob) == 0) continue; /* Find max throughput rate set */ minstrel_ht_sort_best_tp_rates(mi, index, tp_rate); /* Find max throughput rate set within a group */ minstrel_ht_sort_best_tp_rates(mi, index, tmp_group_tp_rate); } memcpy(mg->max_group_tp_rate, tmp_group_tp_rate, sizeof(mg->max_group_tp_rate)); } /* Assign new rate set per sta */ minstrel_ht_assign_best_tp_rates(mi, tmp_mcs_tp_rate, tmp_legacy_tp_rate); memcpy(mi->max_tp_rate, tmp_mcs_tp_rate, sizeof(mi->max_tp_rate)); for (group = 0; group < ARRAY_SIZE(minstrel_mcs_groups); group++) { if (!mi->supported[group]) continue; mg = &mi->groups[group]; mg->max_group_prob_rate = MI_RATE(group, 0); for (i = 0; i < MCS_GROUP_RATES; i++) { if (!(mi->supported[group] & BIT(i))) continue; index = MI_RATE(group, i); /* Find max probability rate per group and global */ minstrel_ht_set_best_prob_rate(mi, &tmp_max_prob_rate, index); } } mi->max_prob_rate = tmp_max_prob_rate; /* Try to increase robustness of max_prob_rate*/ minstrel_ht_prob_rate_reduce_streams(mi); minstrel_ht_refill_sample_rates(mi); #ifdef CONFIG_MAC80211_DEBUGFS /* use fixed index if set */ if (mp->fixed_rate_idx != -1) { for (i = 0; i < 4; i++) mi->max_tp_rate[i] = mp->fixed_rate_idx; mi->max_prob_rate = mp->fixed_rate_idx; } #endif /* Reset update timer */ mi->last_stats_update = jiffies; mi->sample_time = jiffies; } static bool minstrel_ht_txstat_valid(struct minstrel_priv *mp, struct minstrel_ht_sta *mi, struct ieee80211_tx_rate *rate) { int i; if (rate->idx < 0) return false; if (!rate->count) return false; if (rate->flags & IEEE80211_TX_RC_MCS || rate->flags & IEEE80211_TX_RC_VHT_MCS) return true; for (i = 0; i < ARRAY_SIZE(mp->cck_rates); i++) if (rate->idx == mp->cck_rates[i]) return true; for (i = 0; i < ARRAY_SIZE(mp->ofdm_rates[0]); 
i++) if (rate->idx == mp->ofdm_rates[mi->band][i]) return true; return false; } static void minstrel_downgrade_rate(struct minstrel_ht_sta *mi, u16 *idx, bool primary) { int group, orig_group; orig_group = group = MI_RATE_GROUP(*idx); while (group > 0) { group--; if (!mi->supported[group]) continue; if (minstrel_mcs_groups[group].streams > minstrel_mcs_groups[orig_group].streams) continue; if (primary) *idx = mi->groups[group].max_group_tp_rate[0]; else *idx = mi->groups[group].max_group_tp_rate[1]; break; } } static void minstrel_ht_tx_status(void *priv, struct ieee80211_supported_band *sband, void *priv_sta, struct ieee80211_tx_status *st) { struct ieee80211_tx_info *info = st->info; struct minstrel_ht_sta *mi = priv_sta; struct ieee80211_tx_rate *ar = info->status.rates; struct minstrel_rate_stats *rate, *rate2; struct minstrel_priv *mp = priv; u32 update_interval = mp->update_interval; bool last, update = false; int i; /* Ignore packet that was sent with noAck flag */ if (info->flags & IEEE80211_TX_CTL_NO_ACK) return; /* This packet was aggregated but doesn't carry status info */ if ((info->flags & IEEE80211_TX_CTL_AMPDU) && !(info->flags & IEEE80211_TX_STAT_AMPDU)) return; if (!(info->flags & IEEE80211_TX_STAT_AMPDU)) { info->status.ampdu_ack_len = (info->flags & IEEE80211_TX_STAT_ACK ? 1 : 0); info->status.ampdu_len = 1; } /* wraparound */ if (mi->total_packets >= ~0 - info->status.ampdu_len) { mi->total_packets = 0; mi->sample_packets = 0; } mi->total_packets += info->status.ampdu_len; if (info->flags & IEEE80211_TX_CTL_RATE_CTRL_PROBE) mi->sample_packets += info->status.ampdu_len; mi->ampdu_packets++; mi->ampdu_len += info->status.ampdu_len; last = !minstrel_ht_txstat_valid(mp, mi, &ar[0]); for (i = 0; !last; i++) { last = (i == IEEE80211_TX_MAX_RATES - 1) || !minstrel_ht_txstat_valid(mp, mi, &ar[i + 1]); rate = minstrel_ht_get_stats(mp, mi, &ar[i]); if (last) rate->success += info->status.ampdu_ack_len; rate->attempts += ar[i].count * info->status.ampdu_len; } if (mp->hw->max_rates > 1) { /* * check for sudden death of spatial multiplexing, * downgrade to a lower number of streams if necessary. 
*/ rate = minstrel_get_ratestats(mi, mi->max_tp_rate[0]); if (rate->attempts > 30 && rate->success < rate->attempts / 4) { minstrel_downgrade_rate(mi, &mi->max_tp_rate[0], true); update = true; } rate2 = minstrel_get_ratestats(mi, mi->max_tp_rate[1]); if (rate2->attempts > 30 && rate2->success < rate2->attempts / 4) { minstrel_downgrade_rate(mi, &mi->max_tp_rate[1], false); update = true; } } if (time_after(jiffies, mi->last_stats_update + update_interval)) { update = true; minstrel_ht_update_stats(mp, mi); } if (update) minstrel_ht_update_rates(mp, mi); } static void minstrel_calc_retransmit(struct minstrel_priv *mp, struct minstrel_ht_sta *mi, int index) { struct minstrel_rate_stats *mrs; unsigned int tx_time, tx_time_rtscts, tx_time_data; unsigned int cw = mp->cw_min; unsigned int ctime = 0; unsigned int t_slot = 9; /* FIXME */ unsigned int ampdu_len = minstrel_ht_avg_ampdu_len(mi); unsigned int overhead = 0, overhead_rtscts = 0; mrs = minstrel_get_ratestats(mi, index); if (mrs->prob_avg < MINSTREL_FRAC(1, 10)) { mrs->retry_count = 1; mrs->retry_count_rtscts = 1; return; } mrs->retry_count = 2; mrs->retry_count_rtscts = 2; mrs->retry_updated = true; tx_time_data = minstrel_get_duration(index) * ampdu_len / 1000; /* Contention time for first 2 tries */ ctime = (t_slot * cw) >> 1; cw = min((cw << 1) | 1, mp->cw_max); ctime += (t_slot * cw) >> 1; cw = min((cw << 1) | 1, mp->cw_max); if (minstrel_ht_is_legacy_group(MI_RATE_GROUP(index))) { overhead = mi->overhead_legacy; overhead_rtscts = mi->overhead_legacy_rtscts; } else { overhead = mi->overhead; overhead_rtscts = mi->overhead_rtscts; } /* Total TX time for data and Contention after first 2 tries */ tx_time = ctime + 2 * (overhead + tx_time_data); tx_time_rtscts = ctime + 2 * (overhead_rtscts + tx_time_data); /* See how many more tries we can fit inside segment size */ do { /* Contention time for this try */ ctime = (t_slot * cw) >> 1; cw = min((cw << 1) | 1, mp->cw_max); /* Total TX time after this try */ tx_time += ctime + overhead + tx_time_data; tx_time_rtscts += ctime + overhead_rtscts + tx_time_data; if (tx_time_rtscts < mp->segment_size) mrs->retry_count_rtscts++; } while ((tx_time < mp->segment_size) && (++mrs->retry_count < mp->max_retry)); } static void minstrel_ht_set_rate(struct minstrel_priv *mp, struct minstrel_ht_sta *mi, struct ieee80211_sta_rates *ratetbl, int offset, int index) { int group_idx = MI_RATE_GROUP(index); const struct mcs_group *group = &minstrel_mcs_groups[group_idx]; struct minstrel_rate_stats *mrs; u8 idx; u16 flags = group->flags; mrs = minstrel_get_ratestats(mi, index); if (!mrs->retry_updated) minstrel_calc_retransmit(mp, mi, index); if (mrs->prob_avg < MINSTREL_FRAC(20, 100) || !mrs->retry_count) { ratetbl->rate[offset].count = 2; ratetbl->rate[offset].count_rts = 2; ratetbl->rate[offset].count_cts = 2; } else { ratetbl->rate[offset].count = mrs->retry_count; ratetbl->rate[offset].count_cts = mrs->retry_count; ratetbl->rate[offset].count_rts = mrs->retry_count_rtscts; } index = MI_RATE_IDX(index); if (group_idx == MINSTREL_CCK_GROUP) idx = mp->cck_rates[index % ARRAY_SIZE(mp->cck_rates)]; else if (group_idx == MINSTREL_OFDM_GROUP) idx = mp->ofdm_rates[mi->band][index % ARRAY_SIZE(mp->ofdm_rates[0])]; else if (flags & IEEE80211_TX_RC_VHT_MCS) idx = ((group->streams - 1) << 4) | (index & 0xF); else idx = index + (group->streams - 1) * 8; /* enable RTS/CTS if needed: * - if station is in dynamic SMPS (and streams > 1) * - for fallback rates, to increase chances of getting through */ if (offset > 0 || 
(mi->sta->smps_mode == IEEE80211_SMPS_DYNAMIC && group->streams > 1)) { ratetbl->rate[offset].count = ratetbl->rate[offset].count_rts; flags |= IEEE80211_TX_RC_USE_RTS_CTS; } ratetbl->rate[offset].idx = idx; ratetbl->rate[offset].flags = flags; } static inline int minstrel_ht_get_prob_avg(struct minstrel_ht_sta *mi, int rate) { int group = MI_RATE_GROUP(rate); rate = MI_RATE_IDX(rate); return mi->groups[group].rates[rate].prob_avg; } static int minstrel_ht_get_max_amsdu_len(struct minstrel_ht_sta *mi) { int group = MI_RATE_GROUP(mi->max_prob_rate); const struct mcs_group *g = &minstrel_mcs_groups[group]; int rate = MI_RATE_IDX(mi->max_prob_rate); unsigned int duration; /* Disable A-MSDU if max_prob_rate is bad */ if (mi->groups[group].rates[rate].prob_avg < MINSTREL_FRAC(50, 100)) return 1; duration = g->duration[rate]; duration <<= g->shift; /* If the rate is slower than single-stream MCS1, make A-MSDU limit small */ if (duration > MCS_DURATION(1, 0, 52)) return 500; /* * If the rate is slower than single-stream MCS4, limit A-MSDU to usual * data packet size */ if (duration > MCS_DURATION(1, 0, 104)) return 1600; /* * If the rate is slower than single-stream MCS7, or if the max throughput * rate success probability is less than 75%, limit A-MSDU to twice the usual * data packet size */ if (duration > MCS_DURATION(1, 0, 260) || (minstrel_ht_get_prob_avg(mi, mi->max_tp_rate[0]) < MINSTREL_FRAC(75, 100))) return 3200; /* * HT A-MPDU limits maximum MPDU size under BA agreement to 4095 bytes. * Since aggregation sessions are started/stopped without txq flush, use * the limit here to avoid the complexity of having to de-aggregate * packets in the queue. */ if (!mi->sta->vht_cap.vht_supported) return IEEE80211_MAX_MPDU_LEN_HT_BA; /* unlimited */ return 0; } static void minstrel_ht_update_rates(struct minstrel_priv *mp, struct minstrel_ht_sta *mi) { struct ieee80211_sta_rates *rates; int i = 0; rates = kzalloc(sizeof(*rates), GFP_ATOMIC); if (!rates) return; /* Start with max_tp_rate[0] */ minstrel_ht_set_rate(mp, mi, rates, i++, mi->max_tp_rate[0]); if (mp->hw->max_rates >= 3) { /* At least 3 tx rates supported, use max_tp_rate[1] next */ minstrel_ht_set_rate(mp, mi, rates, i++, mi->max_tp_rate[1]); } if (mp->hw->max_rates >= 2) { minstrel_ht_set_rate(mp, mi, rates, i++, mi->max_prob_rate); } mi->sta->max_rc_amsdu_len = minstrel_ht_get_max_amsdu_len(mi); rates->rate[i].idx = -1; rate_control_set_rates(mp->hw, mi->sta, rates); } static u16 minstrel_ht_get_sample_rate(struct minstrel_priv *mp, struct minstrel_ht_sta *mi) { u8 seq; if (mp->hw->max_rates > 1) { seq = mi->sample_seq; mi->sample_seq = (seq + 1) % ARRAY_SIZE(minstrel_sample_seq); seq = minstrel_sample_seq[seq]; } else { seq = MINSTREL_SAMPLE_TYPE_INC; } return __minstrel_ht_get_sample_rate(mi, seq); } static void minstrel_ht_get_rate(void *priv, struct ieee80211_sta *sta, void *priv_sta, struct ieee80211_tx_rate_control *txrc) { const struct mcs_group *sample_group; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(txrc->skb); struct ieee80211_tx_rate *rate = &info->status.rates[0]; struct minstrel_ht_sta *mi = priv_sta; struct minstrel_priv *mp = priv; u16 sample_idx; info->flags |= mi->tx_flags; #ifdef CONFIG_MAC80211_DEBUGFS if (mp->fixed_rate_idx != -1) return; #endif /* Don't use EAPOL frames for sampling on non-mrr hw */ if (mp->hw->max_rates == 1 && (info->control.flags & IEEE80211_TX_CTRL_PORT_CTRL_PROTO)) return; if (time_is_after_jiffies(mi->sample_time)) return; mi->sample_time = jiffies + MINSTREL_SAMPLE_INTERVAL; 
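/* Pull the next sampling candidate from the per-type sample tables; an index of 0 means nothing is scheduled for sampling this round, so the current rate table is used unchanged. */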
sample_idx = minstrel_ht_get_sample_rate(mp, mi); if (!sample_idx) return; sample_group = &minstrel_mcs_groups[MI_RATE_GROUP(sample_idx)]; sample_idx = MI_RATE_IDX(sample_idx); if (sample_group == &minstrel_mcs_groups[MINSTREL_CCK_GROUP] && (sample_idx >= 4) != txrc->short_preamble) return; info->flags |= IEEE80211_TX_CTL_RATE_CTRL_PROBE; rate->count = 1; if (sample_group == &minstrel_mcs_groups[MINSTREL_CCK_GROUP]) { int idx = sample_idx % ARRAY_SIZE(mp->cck_rates); rate->idx = mp->cck_rates[idx]; } else if (sample_group == &minstrel_mcs_groups[MINSTREL_OFDM_GROUP]) { int idx = sample_idx % ARRAY_SIZE(mp->ofdm_rates[0]); rate->idx = mp->ofdm_rates[mi->band][idx]; } else if (sample_group->flags & IEEE80211_TX_RC_VHT_MCS) { ieee80211_rate_set_vht(rate, MI_RATE_IDX(sample_idx), sample_group->streams); } else { rate->idx = sample_idx + (sample_group->streams - 1) * 8; } rate->flags = sample_group->flags; } static void minstrel_ht_update_cck(struct minstrel_priv *mp, struct minstrel_ht_sta *mi, struct ieee80211_supported_band *sband, struct ieee80211_sta *sta) { int i; if (sband->band != NL80211_BAND_2GHZ) return; if (sta->ht_cap.ht_supported && !ieee80211_hw_check(mp->hw, SUPPORTS_HT_CCK_RATES)) return; for (i = 0; i < 4; i++) { if (mp->cck_rates[i] == 0xff || !rate_supported(sta, sband->band, mp->cck_rates[i])) continue; mi->supported[MINSTREL_CCK_GROUP] |= BIT(i); if (sband->bitrates[i].flags & IEEE80211_RATE_SHORT_PREAMBLE) mi->supported[MINSTREL_CCK_GROUP] |= BIT(i + 4); } } static void minstrel_ht_update_ofdm(struct minstrel_priv *mp, struct minstrel_ht_sta *mi, struct ieee80211_supported_band *sband, struct ieee80211_sta *sta) { const u8 *rates; int i; if (sta->ht_cap.ht_supported) return; rates = mp->ofdm_rates[sband->band]; for (i = 0; i < ARRAY_SIZE(mp->ofdm_rates[0]); i++) { if (rates[i] == 0xff || !rate_supported(sta, sband->band, rates[i])) continue; mi->supported[MINSTREL_OFDM_GROUP] |= BIT(i); } } static void minstrel_ht_update_caps(void *priv, struct ieee80211_supported_band *sband, struct cfg80211_chan_def *chandef, struct ieee80211_sta *sta, void *priv_sta) { struct minstrel_priv *mp = priv; struct minstrel_ht_sta *mi = priv_sta; struct ieee80211_mcs_info *mcs = &sta->ht_cap.mcs; u16 ht_cap = sta->ht_cap.cap; struct ieee80211_sta_vht_cap *vht_cap = &sta->vht_cap; const struct ieee80211_rate *ctl_rate; bool ldpc, erp; int use_vht; int n_supported = 0; int ack_dur; int stbc; int i; BUILD_BUG_ON(ARRAY_SIZE(minstrel_mcs_groups) != MINSTREL_GROUPS_NB); if (vht_cap->vht_supported) use_vht = vht_cap->vht_mcs.tx_mcs_map != cpu_to_le16(~0); else use_vht = 0; memset(mi, 0, sizeof(*mi)); mi->sta = sta; mi->band = sband->band; mi->last_stats_update = jiffies; ack_dur = ieee80211_frame_duration(sband->band, 10, 60, 1, 1, 0); mi->overhead = ieee80211_frame_duration(sband->band, 0, 60, 1, 1, 0); mi->overhead += ack_dur; mi->overhead_rtscts = mi->overhead + 2 * ack_dur; ctl_rate = &sband->bitrates[rate_lowest_index(sband, sta)]; erp = ctl_rate->flags & IEEE80211_RATE_ERP_G; ack_dur = ieee80211_frame_duration(sband->band, 10, ctl_rate->bitrate, erp, 1, ieee80211_chandef_get_shift(chandef)); mi->overhead_legacy = ack_dur; mi->overhead_legacy_rtscts = mi->overhead_legacy + 2 * ack_dur; mi->avg_ampdu_len = MINSTREL_FRAC(1, 1); if (!use_vht) { stbc = (ht_cap & IEEE80211_HT_CAP_RX_STBC) >> IEEE80211_HT_CAP_RX_STBC_SHIFT; ldpc = ht_cap & IEEE80211_HT_CAP_LDPC_CODING; } else { stbc = (vht_cap->cap & IEEE80211_VHT_CAP_RXSTBC_MASK) >> IEEE80211_VHT_CAP_RXSTBC_SHIFT; ldpc = vht_cap->cap & 
IEEE80211_VHT_CAP_RXLDPC; } mi->tx_flags |= stbc << IEEE80211_TX_CTL_STBC_SHIFT; if (ldpc) mi->tx_flags |= IEEE80211_TX_CTL_LDPC; for (i = 0; i < ARRAY_SIZE(mi->groups); i++) { u32 gflags = minstrel_mcs_groups[i].flags; int bw, nss; mi->supported[i] = 0; if (minstrel_ht_is_legacy_group(i)) continue; if (gflags & IEEE80211_TX_RC_SHORT_GI) { if (gflags & IEEE80211_TX_RC_40_MHZ_WIDTH) { if (!(ht_cap & IEEE80211_HT_CAP_SGI_40)) continue; } else { if (!(ht_cap & IEEE80211_HT_CAP_SGI_20)) continue; } } if (gflags & IEEE80211_TX_RC_40_MHZ_WIDTH && sta->bandwidth < IEEE80211_STA_RX_BW_40) continue; nss = minstrel_mcs_groups[i].streams; /* Mark MCS > 7 as unsupported if STA is in static SMPS mode */ if (sta->smps_mode == IEEE80211_SMPS_STATIC && nss > 1) continue; /* HT rate */ if (gflags & IEEE80211_TX_RC_MCS) { if (use_vht && minstrel_vht_only) continue; mi->supported[i] = mcs->rx_mask[nss - 1]; if (mi->supported[i]) n_supported++; continue; } /* VHT rate */ if (!vht_cap->vht_supported || WARN_ON(!(gflags & IEEE80211_TX_RC_VHT_MCS)) || WARN_ON(gflags & IEEE80211_TX_RC_160_MHZ_WIDTH)) continue; if (gflags & IEEE80211_TX_RC_80_MHZ_WIDTH) { if (sta->bandwidth < IEEE80211_STA_RX_BW_80 || ((gflags & IEEE80211_TX_RC_SHORT_GI) && !(vht_cap->cap & IEEE80211_VHT_CAP_SHORT_GI_80))) { continue; } } if (gflags & IEEE80211_TX_RC_40_MHZ_WIDTH) bw = BW_40; else if (gflags & IEEE80211_TX_RC_80_MHZ_WIDTH) bw = BW_80; else bw = BW_20; mi->supported[i] = minstrel_get_valid_vht_rates(bw, nss, vht_cap->vht_mcs.tx_mcs_map); if (mi->supported[i]) n_supported++; } minstrel_ht_update_cck(mp, mi, sband, sta); minstrel_ht_update_ofdm(mp, mi, sband, sta); /* create an initial rate table with the lowest supported rates */ minstrel_ht_update_stats(mp, mi); minstrel_ht_update_rates(mp, mi); } static void minstrel_ht_rate_init(void *priv, struct ieee80211_supported_band *sband, struct cfg80211_chan_def *chandef, struct ieee80211_sta *sta, void *priv_sta) { minstrel_ht_update_caps(priv, sband, chandef, sta, priv_sta); } static void minstrel_ht_rate_update(void *priv, struct ieee80211_supported_band *sband, struct cfg80211_chan_def *chandef, struct ieee80211_sta *sta, void *priv_sta, u32 changed) { minstrel_ht_update_caps(priv, sband, chandef, sta, priv_sta); } static void * minstrel_ht_alloc_sta(void *priv, struct ieee80211_sta *sta, gfp_t gfp) { struct ieee80211_supported_band *sband; struct minstrel_ht_sta *mi; struct minstrel_priv *mp = priv; struct ieee80211_hw *hw = mp->hw; int max_rates = 0; int i; for (i = 0; i < NUM_NL80211_BANDS; i++) { sband = hw->wiphy->bands[i]; if (sband && sband->n_bitrates > max_rates) max_rates = sband->n_bitrates; } return kzalloc(sizeof(*mi), gfp); } static void minstrel_ht_free_sta(void *priv, struct ieee80211_sta *sta, void *priv_sta) { kfree(priv_sta); } static void minstrel_ht_fill_rate_array(u8 *dest, struct ieee80211_supported_band *sband, const s16 *bitrates, int n_rates, u32 rate_flags) { int i, j; for (i = 0; i < sband->n_bitrates; i++) { struct ieee80211_rate *rate = &sband->bitrates[i]; if ((rate_flags & sband->bitrates[i].flags) != rate_flags) continue; for (j = 0; j < n_rates; j++) { if (rate->bitrate != bitrates[j]) continue; dest[j] = i; break; } } } static void minstrel_ht_init_cck_rates(struct minstrel_priv *mp) { static const s16 bitrates[4] = { 10, 20, 55, 110 }; struct ieee80211_supported_band *sband; u32 rate_flags = ieee80211_chandef_rate_flags(&mp->hw->conf.chandef); memset(mp->cck_rates, 0xff, sizeof(mp->cck_rates)); sband = mp->hw->wiphy->bands[NL80211_BAND_2GHZ]; if 
(!sband) return; BUILD_BUG_ON(ARRAY_SIZE(mp->cck_rates) != ARRAY_SIZE(bitrates)); minstrel_ht_fill_rate_array(mp->cck_rates, sband, minstrel_cck_bitrates, ARRAY_SIZE(minstrel_cck_bitrates), rate_flags); } static void minstrel_ht_init_ofdm_rates(struct minstrel_priv *mp, enum nl80211_band band) { static const s16 bitrates[8] = { 60, 90, 120, 180, 240, 360, 480, 540 }; struct ieee80211_supported_band *sband; u32 rate_flags = ieee80211_chandef_rate_flags(&mp->hw->conf.chandef); memset(mp->ofdm_rates[band], 0xff, sizeof(mp->ofdm_rates[band])); sband = mp->hw->wiphy->bands[band]; if (!sband) return; BUILD_BUG_ON(ARRAY_SIZE(mp->ofdm_rates[band]) != ARRAY_SIZE(bitrates)); minstrel_ht_fill_rate_array(mp->ofdm_rates[band], sband, minstrel_ofdm_bitrates, ARRAY_SIZE(minstrel_ofdm_bitrates), rate_flags); } static void * minstrel_ht_alloc(struct ieee80211_hw *hw) { struct minstrel_priv *mp; int i; mp = kzalloc(sizeof(struct minstrel_priv), GFP_ATOMIC); if (!mp) return NULL; /* contention window settings * Just an approximation. Using the per-queue values would complicate * the calculations and is probably unnecessary */ mp->cw_min = 15; mp->cw_max = 1023; /* maximum time that the hw is allowed to stay in one MRR segment */ mp->segment_size = 6000; if (hw->max_rate_tries > 0) mp->max_retry = hw->max_rate_tries; else /* safe default, does not necessarily have to match hw properties */ mp->max_retry = 7; if (hw->max_rates >= 4) mp->has_mrr = true; mp->hw = hw; mp->update_interval = HZ / 20; minstrel_ht_init_cck_rates(mp); for (i = 0; i < ARRAY_SIZE(mp->hw->wiphy->bands); i++) minstrel_ht_init_ofdm_rates(mp, i); return mp; } #ifdef CONFIG_MAC80211_DEBUGFS static void minstrel_ht_add_debugfs(struct ieee80211_hw *hw, void *priv, struct dentry *debugfsdir) { struct minstrel_priv *mp = priv; mp->fixed_rate_idx = (u32) -1; debugfs_create_u32("fixed_rate_idx", S_IRUGO | S_IWUGO, debugfsdir, &mp->fixed_rate_idx); } #endif static void minstrel_ht_free(void *priv) { kfree(priv); } static u32 minstrel_ht_get_expected_throughput(void *priv_sta) { struct minstrel_ht_sta *mi = priv_sta; int i, j, prob, tp_avg; i = MI_RATE_GROUP(mi->max_tp_rate[0]); j = MI_RATE_IDX(mi->max_tp_rate[0]); prob = mi->groups[i].rates[j].prob_avg; /* convert tp_avg from pkt per second in kbps */ tp_avg = minstrel_ht_get_tp_avg(mi, i, j, prob) * 10; tp_avg = tp_avg * AVG_PKT_SIZE * 8 / 1024; return tp_avg; } static const struct rate_control_ops mac80211_minstrel_ht = { .name = "minstrel_ht", .capa = RATE_CTRL_CAPA_AMPDU_TRIGGER, .tx_status_ext = minstrel_ht_tx_status, .get_rate = minstrel_ht_get_rate, .rate_init = minstrel_ht_rate_init, .rate_update = minstrel_ht_rate_update, .alloc_sta = minstrel_ht_alloc_sta, .free_sta = minstrel_ht_free_sta, .alloc = minstrel_ht_alloc, .free = minstrel_ht_free, #ifdef CONFIG_MAC80211_DEBUGFS .add_debugfs = minstrel_ht_add_debugfs, .add_sta_debugfs = minstrel_ht_add_sta_debugfs, #endif .get_expected_throughput = minstrel_ht_get_expected_throughput, }; static void __init init_sample_table(void) { int col, i, new_idx; u8 rnd[MCS_GROUP_RATES]; memset(sample_table, 0xff, sizeof(sample_table)); for (col = 0; col < SAMPLE_COLUMNS; col++) { prandom_bytes(rnd, sizeof(rnd)); for (i = 0; i < MCS_GROUP_RATES; i++) { new_idx = (i + rnd[i]) % MCS_GROUP_RATES; while (sample_table[col][new_idx] != 0xff) new_idx = (new_idx + 1) % MCS_GROUP_RATES; sample_table[col][new_idx] = i; } } } int __init rc80211_minstrel_init(void) { init_sample_table(); return ieee80211_rate_control_register(&mac80211_minstrel_ht); } void 
rc80211_minstrel_exit(void) { ieee80211_rate_control_unregister(&mac80211_minstrel_ht); }
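The airtime estimates that drive the rate tables above come from the MCS_DURATION() macros near the top of the file. The standalone sketch below reproduces that arithmetic in plain C so the resulting per-packet durations can be inspected in userspace; DIV_ROUND_UP is a local stand-in for the kernel helper, everything else mirrors the definitions shown above.

/*
 * Illustrative sketch of the minstrel_ht airtime math, not kernel code.
 */
#include <stdio.h>

#define AVG_AMPDU_SIZE	16
#define AVG_PKT_SIZE	1200
#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

#define MCS_NBITS	((AVG_PKT_SIZE * AVG_AMPDU_SIZE) << 3)
#define MCS_NSYMS(bps)	DIV_ROUND_UP(MCS_NBITS, (bps))
#define MCS_SYMBOL_TIME(sgi, syms)				\
	(sgi ?							\
	  ((syms) * 18000 + 4000) / 5 :	/* syms * 3.6 us */	\
	  ((syms) * 1000) << 2		/* syms * 4 us */	\
	)
#define MCS_DURATION(streams, sgi, bps)				\
	(MCS_SYMBOL_TIME(sgi, MCS_NSYMS((streams) * (bps))) / AVG_AMPDU_SIZE)

int main(void)
{
	/* 1 stream, long GI, 26 data bits/symbol: HT MCS0 on 20 MHz */
	printf("MCS0  1x1 LGI: %d ns/packet\n", MCS_DURATION(1, 0, 26));
	/* 2 streams, short GI, 260 data bits/symbol each: HT MCS15 on 20 MHz */
	printf("MCS15 2x2 SGI: %d ns/packet\n", MCS_DURATION(2, 1, 260));
	return 0;
}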
/* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM skb #if !defined(_TRACE_SKB_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_SKB_H #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/tracepoint.h> #define TRACE_SKB_DROP_REASON \ EM(SKB_DROP_REASON_NOT_SPECIFIED, NOT_SPECIFIED) \ EM(SKB_DROP_REASON_NO_SOCKET, NO_SOCKET) \ EM(SKB_DROP_REASON_PKT_TOO_SMALL, PKT_TOO_SMALL) \ EM(SKB_DROP_REASON_TCP_CSUM, TCP_CSUM) \ EM(SKB_DROP_REASON_SOCKET_FILTER, SOCKET_FILTER) \ EM(SKB_DROP_REASON_UDP_CSUM, UDP_CSUM) \ EM(SKB_DROP_REASON_NETFILTER_DROP, NETFILTER_DROP) \ EM(SKB_DROP_REASON_OTHERHOST, OTHERHOST) \ EM(SKB_DROP_REASON_IP_CSUM, IP_CSUM) \ EM(SKB_DROP_REASON_IP_INHDR, IP_INHDR) \ EM(SKB_DROP_REASON_IP_RPFILTER, IP_RPFILTER) \ EM(SKB_DROP_REASON_UNICAST_IN_L2_MULTICAST, \ UNICAST_IN_L2_MULTICAST) \ EMe(SKB_DROP_REASON_MAX, MAX) #undef EM #undef EMe #define EM(a, b) TRACE_DEFINE_ENUM(a); #define EMe(a, b) TRACE_DEFINE_ENUM(a); TRACE_SKB_DROP_REASON #undef EM #undef EMe #define EM(a, b) { a, #b }, #define EMe(a, b) { a, #b } /* * Tracepoint for free an sk_buff: */ TRACE_EVENT(kfree_skb, TP_PROTO(struct sk_buff *skb, void *location, enum skb_drop_reason reason), TP_ARGS(skb, location, reason), TP_STRUCT__entry( __field(void *, skbaddr) __field(void *, location) __field(unsigned short, protocol) __field(enum skb_drop_reason, reason) ), TP_fast_assign( __entry->skbaddr = skb; __entry->location = location; __entry->protocol = ntohs(skb->protocol); __entry->reason = reason; ), TP_printk("skbaddr=%p protocol=%u location=%p reason: %s", __entry->skbaddr, __entry->protocol, __entry->location, __print_symbolic(__entry->reason, TRACE_SKB_DROP_REASON)) ); TRACE_EVENT(consume_skb, TP_PROTO(struct sk_buff *skb), TP_ARGS(skb), TP_STRUCT__entry( __field( void *, skbaddr ) ), TP_fast_assign( __entry->skbaddr = skb; ), TP_printk("skbaddr=%p", __entry->skbaddr) ); TRACE_EVENT(skb_copy_datagram_iovec, TP_PROTO(const struct sk_buff *skb, int len), TP_ARGS(skb, len), TP_STRUCT__entry( __field( const void *, skbaddr ) __field( int, len ) ), TP_fast_assign( __entry->skbaddr = skb; __entry->len = len; ), TP_printk("skbaddr=%p len=%d", __entry->skbaddr, __entry->len) ); #endif /* _TRACE_SKB_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
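The EM()/EMe() redefinitions above are a classic X-macro pattern: the same TRACE_SKB_DROP_REASON list is expanded once into TRACE_DEFINE_ENUM() statements and once into the { value, "name" } table consumed by __print_symbolic(). The compressed userspace sketch below shows the same trick with a shortened, hypothetical reason list and single-argument helpers; it is not the tracepoint infrastructure itself.

/*
 * Userspace sketch of the EM()/EMe() X-macro pattern, not kernel code.
 * The kernel version carries an (enum, short-name) pair per entry.
 */
#include <stdio.h>

#define DEMO_DROP_REASONS	\
	EM(DROP_NOT_SPECIFIED)	\
	EM(DROP_NO_SOCKET)	\
	EMe(DROP_PKT_TOO_SMALL)

/* First expansion: declare the enum values */
#define EM(a)	a,
#define EMe(a)	a
enum demo_drop_reason { DEMO_DROP_REASONS };
#undef EM
#undef EMe

/* Second expansion: value -> string table, as __print_symbolic() needs */
#define EM(a)	[a] = #a,
#define EMe(a)	[a] = #a
static const char * const demo_drop_names[] = { DEMO_DROP_REASONS };
#undef EM
#undef EMe

int main(void)
{
	enum demo_drop_reason r = DROP_NO_SOCKET;

	printf("reason %d = %s\n", r, demo_drop_names[r]);
	return 0;
}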
5 5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef RQ_QOS_H #define RQ_QOS_H #include <linux/kernel.h> #include <linux/blkdev.h> #include <linux/blk_types.h> #include <linux/atomic.h> #include <linux/wait.h> #include <linux/blk-mq.h> #include "blk-mq-debugfs.h" struct blk_mq_debugfs_attr; enum rq_qos_id { RQ_QOS_WBT, RQ_QOS_LATENCY, RQ_QOS_COST, RQ_QOS_IOPRIO, }; struct rq_wait { wait_queue_head_t wait; atomic_t inflight; }; struct rq_qos { struct rq_qos_ops *ops; struct request_queue *q; enum rq_qos_id id; struct rq_qos *next; #ifdef CONFIG_BLK_DEBUG_FS struct dentry *debugfs_dir; #endif }; struct rq_qos_ops { void (*throttle)(struct rq_qos *, struct bio *); void (*track)(struct rq_qos *, struct request *, struct bio *); void (*merge)(struct rq_qos *, struct request *, struct bio *); void (*issue)(struct rq_qos *, struct request *); void (*requeue)(struct rq_qos *, struct request *); void (*done)(struct rq_qos *, struct request *); void (*done_bio)(struct rq_qos *, struct bio *); void (*cleanup)(struct rq_qos *, struct bio *); void (*queue_depth_changed)(struct rq_qos *); void (*exit)(struct rq_qos *); const struct blk_mq_debugfs_attr *debugfs_attrs; }; struct rq_depth { unsigned int max_depth; int scale_step; bool scaled_max; unsigned int queue_depth; unsigned int default_depth; }; static inline struct rq_qos *rq_qos_id(struct request_queue *q, enum rq_qos_id id) { struct rq_qos *rqos; for (rqos = q->rq_qos; rqos; rqos = rqos->next) { if (rqos->id == id) break; } return rqos; } static inline struct rq_qos *wbt_rq_qos(struct request_queue *q) { return rq_qos_id(q, RQ_QOS_WBT); } static inline struct rq_qos *blkcg_rq_qos(struct request_queue *q) { return rq_qos_id(q, RQ_QOS_LATENCY); } static inline void rq_wait_init(struct rq_wait *rq_wait) { atomic_set(&rq_wait->inflight, 0); init_waitqueue_head(&rq_wait->wait); } static inline int rq_qos_add(struct request_queue *q, struct rq_qos *rqos) { /* * No IO can be in-flight when adding rqos, so freeze queue, which * is fine since we only support rq_qos for blk-mq queue. * * Reuse ->queue_lock for protecting against other concurrent * rq_qos adding/deleting */ blk_mq_freeze_queue(q); spin_lock_irq(&q->queue_lock); if (rq_qos_id(q, rqos->id)) goto ebusy; rqos->next = q->rq_qos; q->rq_qos = rqos; spin_unlock_irq(&q->queue_lock); blk_mq_unfreeze_queue(q); if (rqos->ops->debugfs_attrs) blk_mq_debugfs_register_rqos(rqos); return 0; ebusy: spin_unlock_irq(&q->queue_lock); blk_mq_unfreeze_queue(q); return -EBUSY; } static inline void rq_qos_del(struct request_queue *q, struct rq_qos *rqos) { struct rq_qos **cur; /* * See comment in rq_qos_add() about freezing queue & using * ->queue_lock. 
*/ blk_mq_freeze_queue(q); spin_lock_irq(&q->queue_lock); for (cur = &q->rq_qos; *cur; cur = &(*cur)->next) { if (*cur == rqos) { *cur = rqos->next; break; } } spin_unlock_irq(&q->queue_lock); blk_mq_unfreeze_queue(q); blk_mq_debugfs_unregister_rqos(rqos); } typedef bool (acquire_inflight_cb_t)(struct rq_wait *rqw, void *private_data); typedef void (cleanup_cb_t)(struct rq_wait *rqw, void *private_data); void rq_qos_wait(struct rq_wait *rqw, void *private_data, acquire_inflight_cb_t *acquire_inflight_cb, cleanup_cb_t *cleanup_cb); bool rq_wait_inc_below(struct rq_wait *rq_wait, unsigned int limit); bool rq_depth_scale_up(struct rq_depth *rqd); bool rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle); bool rq_depth_calc_max_depth(struct rq_depth *rqd); void __rq_qos_cleanup(struct rq_qos *rqos, struct bio *bio); void __rq_qos_done(struct rq_qos *rqos, struct request *rq); void __rq_qos_issue(struct rq_qos *rqos, struct request *rq); void __rq_qos_requeue(struct rq_qos *rqos, struct request *rq); void __rq_qos_throttle(struct rq_qos *rqos, struct bio *bio); void __rq_qos_track(struct rq_qos *rqos, struct request *rq, struct bio *bio); void __rq_qos_merge(struct rq_qos *rqos, struct request *rq, struct bio *bio); void __rq_qos_done_bio(struct rq_qos *rqos, struct bio *bio); void __rq_qos_queue_depth_changed(struct rq_qos *rqos); static inline void rq_qos_cleanup(struct request_queue *q, struct bio *bio) { if (q->rq_qos) __rq_qos_cleanup(q->rq_qos, bio); } static inline void rq_qos_done(struct request_queue *q, struct request *rq) { if (q->rq_qos) __rq_qos_done(q->rq_qos, rq); } static inline void rq_qos_issue(struct request_queue *q, struct request *rq) { if (q->rq_qos) __rq_qos_issue(q->rq_qos, rq); } static inline void rq_qos_requeue(struct request_queue *q, struct request *rq) { if (q->rq_qos) __rq_qos_requeue(q->rq_qos, rq); } static inline void rq_qos_done_bio(struct bio *bio) { if (bio->bi_bdev && (bio_flagged(bio, BIO_QOS_THROTTLED) || bio_flagged(bio, BIO_QOS_MERGED))) { struct request_queue *q = bdev_get_queue(bio->bi_bdev); if (q->rq_qos) __rq_qos_done_bio(q->rq_qos, bio); } } static inline void rq_qos_throttle(struct request_queue *q, struct bio *bio) { if (q->rq_qos) { bio_set_flag(bio, BIO_QOS_THROTTLED); __rq_qos_throttle(q->rq_qos, bio); } } static inline void rq_qos_track(struct request_queue *q, struct request *rq, struct bio *bio) { if (q->rq_qos) __rq_qos_track(q->rq_qos, rq, bio); } static inline void rq_qos_merge(struct request_queue *q, struct request *rq, struct bio *bio) { if (q->rq_qos) { bio_set_flag(bio, BIO_QOS_MERGED); __rq_qos_merge(q->rq_qos, rq, bio); } } static inline void rq_qos_queue_depth_changed(struct request_queue *q) { if (q->rq_qos) __rq_qos_queue_depth_changed(q->rq_qos); } void rq_qos_exit(struct request_queue *); #endif
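/*
 * Sketch of how a throttling policy hooks into the rq_qos chain declared
 * above (the same shape wbt and blk-iolatency use). The "my_qos" names are
 * illustrative; only struct rq_qos, struct rq_qos_ops, rq_wait_init() and
 * rq_qos_add() come from this header.
 */
#include <linux/slab.h>
#include "blk-rq-qos.h"

struct my_qos {
	struct rq_qos rqos;
	struct rq_wait rqw;
};

static void my_qos_throttle(struct rq_qos *rqos, struct bio *bio)
{
	/* block here until the policy admits the bio (policy logic not shown) */
}

static void my_qos_exit(struct rq_qos *rqos)
{
	struct my_qos *mq = container_of(rqos, struct my_qos, rqos);

	kfree(mq);
}

static struct rq_qos_ops my_qos_ops = {
	.throttle	= my_qos_throttle,
	.exit		= my_qos_exit,
};

static int my_qos_init(struct request_queue *q)
{
	struct my_qos *mq;

	mq = kzalloc(sizeof(*mq), GFP_KERNEL);
	if (!mq)
		return -ENOMEM;

	rq_wait_init(&mq->rqw);
	mq->rqos.id = RQ_QOS_WBT;	/* a real policy uses its own id from the enum above */
	mq->rqos.ops = &my_qos_ops;
	mq->rqos.q = q;

	/* rq_qos_add() returns -EBUSY if a policy with this id is already attached */
	if (rq_qos_add(q, &mq->rqos)) {
		kfree(mq);
		return -EBUSY;
	}
	return 0;
}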
96 95 95 163 164 164 9 9 9 9 9 14 14 14 14 14 20 36 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 // SPDX-License-Identifier: GPL-2.0-only #include <linux/module.h> #include <linux/errno.h> #include <linux/socket.h> #include <linux/udp.h> #include <linux/types.h> #include <linux/kernel.h> #include <net/dst_metadata.h> #include <net/net_namespace.h> #include <net/udp.h> #include <net/udp_tunnel.h> int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg, struct socket **sockp) { int err; struct socket *sock = NULL; struct sockaddr_in udp_addr; err = sock_create_kern(net, AF_INET, SOCK_DGRAM, 0, &sock); if (err < 0) goto error; if (cfg->bind_ifindex) { err = sock_bindtoindex(sock->sk, cfg->bind_ifindex, true); if (err < 0) goto error; } udp_addr.sin_family = AF_INET; udp_addr.sin_addr = cfg->local_ip; udp_addr.sin_port = cfg->local_udp_port; err = kernel_bind(sock, (struct sockaddr *)&udp_addr, sizeof(udp_addr)); if (err < 0) goto error; if (cfg->peer_udp_port) { udp_addr.sin_family = AF_INET; udp_addr.sin_addr = cfg->peer_ip; udp_addr.sin_port = cfg->peer_udp_port; err = kernel_connect(sock, (struct sockaddr *)&udp_addr, sizeof(udp_addr), 0); if (err < 0) goto error; } sock->sk->sk_no_check_tx = !cfg->use_udp_checksums; *sockp = sock; return 0; error: if (sock) { kernel_sock_shutdown(sock, SHUT_RDWR); sock_release(sock); } *sockp = NULL; return err; } EXPORT_SYMBOL(udp_sock_create4); void setup_udp_tunnel_sock(struct net *net, struct socket *sock, struct udp_tunnel_sock_cfg *cfg) { struct sock *sk = sock->sk; /* Disable multicast loopback */ inet_sk(sk)->mc_loop = 0; /* Enable CHECKSUM_UNNECESSARY to CHECKSUM_COMPLETE conversion */ inet_inc_convert_csum(sk); rcu_assign_sk_user_data(sk, cfg->sk_user_data); udp_sk(sk)->encap_type = cfg->encap_type; udp_sk(sk)->encap_rcv = cfg->encap_rcv; udp_sk(sk)->encap_err_rcv = cfg->encap_err_rcv; udp_sk(sk)->encap_err_lookup = cfg->encap_err_lookup; udp_sk(sk)->encap_destroy = cfg->encap_destroy; udp_sk(sk)->gro_receive = cfg->gro_receive; udp_sk(sk)->gro_complete = cfg->gro_complete; udp_tunnel_encap_enable(sock); } EXPORT_SYMBOL_GPL(setup_udp_tunnel_sock); void udp_tunnel_push_rx_port(struct net_device *dev, struct socket *sock, unsigned short type) { struct sock *sk = sock->sk; struct udp_tunnel_info ti; ti.type = type; ti.sa_family = sk->sk_family; ti.port = inet_sk(sk)->inet_sport; udp_tunnel_nic_add_port(dev, &ti); } EXPORT_SYMBOL_GPL(udp_tunnel_push_rx_port); void udp_tunnel_drop_rx_port(struct net_device *dev, struct socket *sock, unsigned short type) { struct sock *sk = sock->sk; struct udp_tunnel_info ti; ti.type = type; ti.sa_family = sk->sk_family; ti.port = inet_sk(sk)->inet_sport; udp_tunnel_nic_del_port(dev, &ti); } EXPORT_SYMBOL_GPL(udp_tunnel_drop_rx_port); /* Notify netdevs that UDP port started listening */ void udp_tunnel_notify_add_rx_port(struct socket 
*sock, unsigned short type) { struct sock *sk = sock->sk; struct net *net = sock_net(sk); struct udp_tunnel_info ti; struct net_device *dev; ti.type = type; ti.sa_family = sk->sk_family; ti.port = inet_sk(sk)->inet_sport; rcu_read_lock(); for_each_netdev_rcu(net, dev) { udp_tunnel_nic_add_port(dev, &ti); } rcu_read_unlock(); } EXPORT_SYMBOL_GPL(udp_tunnel_notify_add_rx_port); /* Notify netdevs that UDP port is no more listening */ void udp_tunnel_notify_del_rx_port(struct socket *sock, unsigned short type) { struct sock *sk = sock->sk; struct net *net = sock_net(sk); struct udp_tunnel_info ti; struct net_device *dev; ti.type = type; ti.sa_family = sk->sk_family; ti.port = inet_sk(sk)->inet_sport; rcu_read_lock(); for_each_netdev_rcu(net, dev) { udp_tunnel_nic_del_port(dev, &ti); } rcu_read_unlock(); } EXPORT_SYMBOL_GPL(udp_tunnel_notify_del_rx_port); void udp_tunnel_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb, __be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port, bool xnet, bool nocheck) { struct udphdr *uh; __skb_push(skb, sizeof(*uh)); skb_reset_transport_header(skb); uh = udp_hdr(skb); uh->dest = dst_port; uh->source = src_port; uh->len = htons(skb->len); memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); udp_set_csum(nocheck, skb, src, dst, skb->len); iptunnel_xmit(sk, rt, skb, src, dst, IPPROTO_UDP, tos, ttl, df, xnet); } EXPORT_SYMBOL_GPL(udp_tunnel_xmit_skb); void udp_tunnel_sock_release(struct socket *sock) { rcu_assign_sk_user_data(sock->sk, NULL); synchronize_rcu(); kernel_sock_shutdown(sock, SHUT_RDWR); sock_release(sock); } EXPORT_SYMBOL_GPL(udp_tunnel_sock_release); struct metadata_dst *udp_tun_rx_dst(struct sk_buff *skb, unsigned short family, __be16 flags, __be64 tunnel_id, int md_size) { struct metadata_dst *tun_dst; struct ip_tunnel_info *info; if (family == AF_INET) tun_dst = ip_tun_rx_dst(skb, flags, tunnel_id, md_size); else tun_dst = ipv6_tun_rx_dst(skb, flags, tunnel_id, md_size); if (!tun_dst) return NULL; info = &tun_dst->u.tun_info; info->key.tp_src = udp_hdr(skb)->source; info->key.tp_dst = udp_hdr(skb)->dest; if (udp_hdr(skb)->check) info->key.tun_flags |= TUNNEL_CSUM; return tun_dst; } EXPORT_SYMBOL_GPL(udp_tun_rx_dst); MODULE_LICENSE("GPL");
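/*
 * Sketch of the usual consumer of the helpers exported above (the pattern
 * tunnel drivers such as vxlan/geneve follow): create a kernel UDP socket
 * bound to the tunnel port, then turn it into an encapsulation socket.
 * my_encap_recv(), my_tunnel_open() and the port number are illustrative
 * assumptions, not part of this file.
 */
#include <net/udp_tunnel.h>

static int my_encap_recv(struct sock *sk, struct sk_buff *skb)
{
	/* decapsulate here; return 0 if consumed, >0 to fall back to UDP */
	return 0;
}

static int my_tunnel_open(struct net *net, struct socket **sockp)
{
	struct udp_port_cfg udp_conf = { };		/* local_ip left zero = wildcard bind */
	struct udp_tunnel_sock_cfg tunnel_cfg = { };
	int err;

	udp_conf.local_udp_port = htons(4789);		/* example port */
	udp_conf.use_udp_checksums = false;

	err = udp_sock_create4(net, &udp_conf, sockp);
	if (err < 0)
		return err;

	tunnel_cfg.sk_user_data = NULL;			/* usually the driver's private state */
	tunnel_cfg.encap_type = 1;			/* generic UDP encapsulation */
	tunnel_cfg.encap_rcv = my_encap_recv;

	setup_udp_tunnel_sock(net, *sockp, &tunnel_cfg);
	return 0;
}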
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_RCULIST_H #define _LINUX_RCULIST_H #ifdef __KERNEL__ /* * RCU-protected list version */ #include <linux/list.h> #include <linux/rcupdate.h> /* * INIT_LIST_HEAD_RCU - Initialize a list_head visible to RCU readers * @list: list to be initialized * * You should instead use INIT_LIST_HEAD() for normal initialization and *
cleanup tasks, when readers have no access to the list being initialized. * However, if the list being initialized is visible to readers, you * need to keep the compiler from being too mischievous. */ static inline void INIT_LIST_HEAD_RCU(struct list_head *list) { WRITE_ONCE(list->next, list); WRITE_ONCE(list->prev, list); } /* * return the ->next pointer of a list_head in an rcu safe * way, we must not access it directly */ #define list_next_rcu(list) (*((struct list_head __rcu **)(&(list)->next))) /** * list_tail_rcu - returns the prev pointer of the head of the list * @head: the head of the list * * Note: This should only be used with the list header, and even then * only if list_del() and similar primitives are not also used on the * list header. */ #define list_tail_rcu(head) (*((struct list_head __rcu **)(&(head)->prev))) /* * Check during list traversal that we are within an RCU reader */ #define check_arg_count_one(dummy) #ifdef CONFIG_PROVE_RCU_LIST #define __list_check_rcu(dummy, cond, extra...) \ ({ \ check_arg_count_one(extra); \ RCU_LOCKDEP_WARN(!(cond) && !rcu_read_lock_any_held(), \ "RCU-list traversed in non-reader section!"); \ }) #define __list_check_srcu(cond) \ ({ \ RCU_LOCKDEP_WARN(!(cond), \ "RCU-list traversed without holding the required lock!");\ }) #else #define __list_check_rcu(dummy, cond, extra...) \ ({ check_arg_count_one(extra); }) #define __list_check_srcu(cond) ({ }) #endif /* * Insert a new entry between two known consecutive entries. * * This is only for internal list manipulation where we know * the prev/next entries already! */ static inline void __list_add_rcu(struct list_head *new, struct list_head *prev, struct list_head *next) { if (!__list_add_valid(new, prev, next)) return; new->next = next; new->prev = prev; rcu_assign_pointer(list_next_rcu(prev), new); next->prev = new; } /** * list_add_rcu - add a new entry to rcu-protected list * @new: new entry to be added * @head: list head to add it after * * Insert a new entry after the specified head. * This is good for implementing stacks. * * The caller must take whatever precautions are necessary * (such as holding appropriate locks) to avoid racing * with another list-mutation primitive, such as list_add_rcu() * or list_del_rcu(), running on this same list. * However, it is perfectly legal to run concurrently with * the _rcu list-traversal primitives, such as * list_for_each_entry_rcu(). */ static inline void list_add_rcu(struct list_head *new, struct list_head *head) { __list_add_rcu(new, head, head->next); } /** * list_add_tail_rcu - add a new entry to rcu-protected list * @new: new entry to be added * @head: list head to add it before * * Insert a new entry before the specified head. * This is useful for implementing queues. * * The caller must take whatever precautions are necessary * (such as holding appropriate locks) to avoid racing * with another list-mutation primitive, such as list_add_tail_rcu() * or list_del_rcu(), running on this same list. * However, it is perfectly legal to run concurrently with * the _rcu list-traversal primitives, such as * list_for_each_entry_rcu(). */ static inline void list_add_tail_rcu(struct list_head *new, struct list_head *head) { __list_add_rcu(new, head->prev, head); } /** * list_del_rcu - deletes entry from list without re-initialization * @entry: the element to delete from the list. * * Note: list_empty() on entry does not return true after this, * the entry is in an undefined state. It is useful for RCU based * lockfree traversal. 
* * In particular, it means that we can not poison the forward * pointers that may still be used for walking the list. * * The caller must take whatever precautions are necessary * (such as holding appropriate locks) to avoid racing * with another list-mutation primitive, such as list_del_rcu() * or list_add_rcu(), running on this same list. * However, it is perfectly legal to run concurrently with * the _rcu list-traversal primitives, such as * list_for_each_entry_rcu(). * * Note that the caller is not permitted to immediately free * the newly deleted entry. Instead, either synchronize_rcu() * or call_rcu() must be used to defer freeing until an RCU * grace period has elapsed. */ static inline void list_del_rcu(struct list_head *entry) { __list_del_entry(entry); entry->prev = LIST_POISON2; } /** * hlist_del_init_rcu - deletes entry from hash list with re-initialization * @n: the element to delete from the hash list. * * Note: list_unhashed() on the node return true after this. It is * useful for RCU based read lockfree traversal if the writer side * must know if the list entry is still hashed or already unhashed. * * In particular, it means that we can not poison the forward pointers * that may still be used for walking the hash list and we can only * zero the pprev pointer so list_unhashed() will return true after * this. * * The caller must take whatever precautions are necessary (such as * holding appropriate locks) to avoid racing with another * list-mutation primitive, such as hlist_add_head_rcu() or * hlist_del_rcu(), running on this same list. However, it is * perfectly legal to run concurrently with the _rcu list-traversal * primitives, such as hlist_for_each_entry_rcu(). */ static inline void hlist_del_init_rcu(struct hlist_node *n) { if (!hlist_unhashed(n)) { __hlist_del(n); WRITE_ONCE(n->pprev, NULL); } } /** * list_replace_rcu - replace old entry by new one * @old : the element to be replaced * @new : the new element to insert * * The @old entry will be replaced with the @new entry atomically. * Note: @old should not be empty. */ static inline void list_replace_rcu(struct list_head *old, struct list_head *new) { new->next = old->next; new->prev = old->prev; rcu_assign_pointer(list_next_rcu(new->prev), new); new->next->prev = new; old->prev = LIST_POISON2; } /** * __list_splice_init_rcu - join an RCU-protected list into an existing list. * @list: the RCU-protected list to splice * @prev: points to the last element of the existing list * @next: points to the first element of the existing list * @sync: synchronize_rcu, synchronize_rcu_expedited, ... * * The list pointed to by @prev and @next can be RCU-read traversed * concurrently with this function. * * Note that this function blocks. * * Important note: the caller must take whatever action is necessary to prevent * any other updates to the existing list. In principle, it is possible to * modify the list as soon as sync() begins execution. If this sort of thing * becomes necessary, an alternative version based on call_rcu() could be * created. But only if -really- needed -- there is no shortage of RCU API * members. */ static inline void __list_splice_init_rcu(struct list_head *list, struct list_head *prev, struct list_head *next, void (*sync)(void)) { struct list_head *first = list->next; struct list_head *last = list->prev; /* * "first" and "last" tracking list, so initialize it. RCU readers * have access to this list, so we must use INIT_LIST_HEAD_RCU() * instead of INIT_LIST_HEAD(). 
*/ INIT_LIST_HEAD_RCU(list); /* * At this point, the list body still points to the source list. * Wait for any readers to finish using the list before splicing * the list body into the new list. Any new readers will see * an empty list. */ sync(); ASSERT_EXCLUSIVE_ACCESS(*first); ASSERT_EXCLUSIVE_ACCESS(*last); /* * Readers are finished with the source list, so perform splice. * The order is important if the new list is global and accessible * to concurrent RCU readers. Note that RCU readers are not * permitted to traverse the prev pointers without excluding * this function. */ last->next = next; rcu_assign_pointer(list_next_rcu(prev), first); first->prev = prev; next->prev = last; } /** * list_splice_init_rcu - splice an RCU-protected list into an existing list, * designed for stacks. * @list: the RCU-protected list to splice * @head: the place in the existing list to splice the first list into * @sync: synchronize_rcu, synchronize_rcu_expedited, ... */ static inline void list_splice_init_rcu(struct list_head *list, struct list_head *head, void (*sync)(void)) { if (!list_empty(list)) __list_splice_init_rcu(list, head, head->next, sync); } /** * list_splice_tail_init_rcu - splice an RCU-protected list into an existing * list, designed for queues. * @list: the RCU-protected list to splice * @head: the place in the existing list to splice the first list into * @sync: synchronize_rcu, synchronize_rcu_expedited, ... */ static inline void list_splice_tail_init_rcu(struct list_head *list, struct list_head *head, void (*sync)(void)) { if (!list_empty(list)) __list_splice_init_rcu(list, head->prev, head, sync); } /** * list_entry_rcu - get the struct for this entry * @ptr: the &struct list_head pointer. * @type: the type of the struct this is embedded in. * @member: the name of the list_head within the struct. * * This primitive may safely run concurrently with the _rcu list-mutation * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock(). */ #define list_entry_rcu(ptr, type, member) \ container_of(READ_ONCE(ptr), type, member) /* * Where are list_empty_rcu() and list_first_entry_rcu()? * * They do not exist because they would lead to subtle race conditions: * * if (!list_empty_rcu(mylist)) { * struct foo *bar = list_first_entry_rcu(mylist, struct foo, list_member); * do_something(bar); * } * * The list might be non-empty when list_empty_rcu() checks it, but it * might have become empty by the time that list_first_entry_rcu() rereads * the ->next pointer, which would result in a SEGV. * * When not using RCU, it is OK for list_first_entry() to re-read that * pointer because both functions should be protected by some lock that * blocks writers. * * When using RCU, list_empty() uses READ_ONCE() to fetch the * RCU-protected ->next pointer and then compares it to the address of the * list head. However, it neither dereferences this pointer nor provides * this pointer to its caller. Thus, READ_ONCE() suffices (that is, * rcu_dereference() is not needed), which means that list_empty() can be * used anywhere you would want to use list_empty_rcu(). Just don't * expect anything useful to happen if you do a subsequent lockless * call to list_first_entry_rcu()!!! * * See list_first_or_null_rcu for an alternative. */ /** * list_first_or_null_rcu - get the first element from a list * @ptr: the list head to take the element from. * @type: the type of the struct this is embedded in. * @member: the name of the list_head within the struct. * * Note that if the list is empty, it returns NULL. 
* * This primitive may safely run concurrently with the _rcu list-mutation * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock(). */ #define list_first_or_null_rcu(ptr, type, member) \ ({ \ struct list_head *__ptr = (ptr); \ struct list_head *__next = READ_ONCE(__ptr->next); \ likely(__ptr != __next) ? list_entry_rcu(__next, type, member) : NULL; \ }) /** * list_next_or_null_rcu - get the first element from a list * @head: the head for the list. * @ptr: the list head to take the next element from. * @type: the type of the struct this is embedded in. * @member: the name of the list_head within the struct. * * Note that if the ptr is at the end of the list, NULL is returned. * * This primitive may safely run concurrently with the _rcu list-mutation * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock(). */ #define list_next_or_null_rcu(head, ptr, type, member) \ ({ \ struct list_head *__head = (head); \ struct list_head *__ptr = (ptr); \ struct list_head *__next = READ_ONCE(__ptr->next); \ likely(__next != __head) ? list_entry_rcu(__next, type, \ member) : NULL; \ }) /** * list_for_each_entry_rcu - iterate over rcu list of given type * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. * @cond: optional lockdep expression if called from non-RCU protection. * * This list-traversal primitive may safely run concurrently with * the _rcu list-mutation primitives such as list_add_rcu() * as long as the traversal is guarded by rcu_read_lock(). */ #define list_for_each_entry_rcu(pos, head, member, cond...) \ for (__list_check_rcu(dummy, ## cond, 0), \ pos = list_entry_rcu((head)->next, typeof(*pos), member); \ &pos->member != (head); \ pos = list_entry_rcu(pos->member.next, typeof(*pos), member)) /** * list_for_each_entry_srcu - iterate over rcu list of given type * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. * @cond: lockdep expression for the lock required to traverse the list. * * This list-traversal primitive may safely run concurrently with * the _rcu list-mutation primitives such as list_add_rcu() * as long as the traversal is guarded by srcu_read_lock(). * The lockdep expression srcu_read_lock_held() can be passed as the * cond argument from read side. */ #define list_for_each_entry_srcu(pos, head, member, cond) \ for (__list_check_srcu(cond), \ pos = list_entry_rcu((head)->next, typeof(*pos), member); \ &pos->member != (head); \ pos = list_entry_rcu(pos->member.next, typeof(*pos), member)) /** * list_entry_lockless - get the struct for this entry * @ptr: the &struct list_head pointer. * @type: the type of the struct this is embedded in. * @member: the name of the list_head within the struct. * * This primitive may safely run concurrently with the _rcu * list-mutation primitives such as list_add_rcu(), but requires some * implicit RCU read-side guarding. One example is running within a special * exception-time environment where preemption is disabled and where lockdep * cannot be invoked. Another example is when items are added to the list, * but never deleted. */ #define list_entry_lockless(ptr, type, member) \ container_of((typeof(ptr))READ_ONCE(ptr), type, member) /** * list_for_each_entry_lockless - iterate over rcu list of given type * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_struct within the struct. 
* * This primitive may safely run concurrently with the _rcu * list-mutation primitives such as list_add_rcu(), but requires some * implicit RCU read-side guarding. One example is running within a special * exception-time environment where preemption is disabled and where lockdep * cannot be invoked. Another example is when items are added to the list, * but never deleted. */ #define list_for_each_entry_lockless(pos, head, member) \ for (pos = list_entry_lockless((head)->next, typeof(*pos), member); \ &pos->member != (head); \ pos = list_entry_lockless(pos->member.next, typeof(*pos), member)) /** * list_for_each_entry_continue_rcu - continue iteration over list of given type * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_head within the struct. * * Continue to iterate over list of given type, continuing after * the current position which must have been in the list when the RCU read * lock was taken. * This would typically require either that you obtained the node from a * previous walk of the list in the same RCU read-side critical section, or * that you held some sort of non-RCU reference (such as a reference count) * to keep the node alive *and* in the list. * * This iterator is similar to list_for_each_entry_from_rcu() except * this starts after the given position and that one starts at the given * position. */ #define list_for_each_entry_continue_rcu(pos, head, member) \ for (pos = list_entry_rcu(pos->member.next, typeof(*pos), member); \ &pos->member != (head); \ pos = list_entry_rcu(pos->member.next, typeof(*pos), member)) /** * list_for_each_entry_from_rcu - iterate over a list from current point * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the list_node within the struct. * * Iterate over the tail of a list starting from a given position, * which must have been in the list when the RCU read lock was taken. * This would typically require either that you obtained the node from a * previous walk of the list in the same RCU read-side critical section, or * that you held some sort of non-RCU reference (such as a reference count) * to keep the node alive *and* in the list. * * This iterator is similar to list_for_each_entry_continue_rcu() except * this starts from the given position and that one starts from the position * after the given position. */ #define list_for_each_entry_from_rcu(pos, head, member) \ for (; &(pos)->member != (head); \ pos = list_entry_rcu(pos->member.next, typeof(*(pos)), member)) /** * hlist_del_rcu - deletes entry from hash list without re-initialization * @n: the element to delete from the hash list. * * Note: list_unhashed() on entry does not return true after this, * the entry is in an undefined state. It is useful for RCU based * lockfree traversal. * * In particular, it means that we can not poison the forward * pointers that may still be used for walking the hash list. * * The caller must take whatever precautions are necessary * (such as holding appropriate locks) to avoid racing * with another list-mutation primitive, such as hlist_add_head_rcu() * or hlist_del_rcu(), running on this same list. * However, it is perfectly legal to run concurrently with * the _rcu list-traversal primitives, such as * hlist_for_each_entry(). 
*/ static inline void hlist_del_rcu(struct hlist_node *n) { __hlist_del(n); WRITE_ONCE(n->pprev, LIST_POISON2); } /** * hlist_replace_rcu - replace old entry by new one * @old : the element to be replaced * @new : the new element to insert * * The @old entry will be replaced with the @new entry atomically. */ static inline void hlist_replace_rcu(struct hlist_node *old, struct hlist_node *new) { struct hlist_node *next = old->next; new->next = next; WRITE_ONCE(new->pprev, old->pprev); rcu_assign_pointer(*(struct hlist_node __rcu **)new->pprev, new); if (next) WRITE_ONCE(new->next->pprev, &new->next); WRITE_ONCE(old->pprev, LIST_POISON2); } /** * hlists_swap_heads_rcu - swap the lists the hlist heads point to * @left: The hlist head on the left * @right: The hlist head on the right * * The lists start out as [@left ][node1 ... ] and * [@right ][node2 ... ] * The lists end up as [@left ][node2 ... ] * [@right ][node1 ... ] */ static inline void hlists_swap_heads_rcu(struct hlist_head *left, struct hlist_head *right) { struct hlist_node *node1 = left->first; struct hlist_node *node2 = right->first; rcu_assign_pointer(left->first, node2); rcu_assign_pointer(right->first, node1); WRITE_ONCE(node2->pprev, &left->first); WRITE_ONCE(node1->pprev, &right->first); } /* * return the first or the next element in an RCU protected hlist */ #define hlist_first_rcu(head) (*((struct hlist_node __rcu **)(&(head)->first))) #define hlist_next_rcu(node) (*((struct hlist_node __rcu **)(&(node)->next))) #define hlist_pprev_rcu(node) (*((struct hlist_node __rcu **)((node)->pprev))) /** * hlist_add_head_rcu * @n: the element to add to the hash list. * @h: the list to add to. * * Description: * Adds the specified element to the specified hlist, * while permitting racing traversals. * * The caller must take whatever precautions are necessary * (such as holding appropriate locks) to avoid racing * with another list-mutation primitive, such as hlist_add_head_rcu() * or hlist_del_rcu(), running on this same list. * However, it is perfectly legal to run concurrently with * the _rcu list-traversal primitives, such as * hlist_for_each_entry_rcu(), used to prevent memory-consistency * problems on Alpha CPUs. Regardless of the type of CPU, the * list-traversal primitive must be guarded by rcu_read_lock(). */ static inline void hlist_add_head_rcu(struct hlist_node *n, struct hlist_head *h) { struct hlist_node *first = h->first; n->next = first; WRITE_ONCE(n->pprev, &h->first); rcu_assign_pointer(hlist_first_rcu(h), n); if (first) WRITE_ONCE(first->pprev, &n->next); } /** * hlist_add_tail_rcu * @n: the element to add to the hash list. * @h: the list to add to. * * Description: * Adds the specified element to the specified hlist, * while permitting racing traversals. * * The caller must take whatever precautions are necessary * (such as holding appropriate locks) to avoid racing * with another list-mutation primitive, such as hlist_add_head_rcu() * or hlist_del_rcu(), running on this same list. * However, it is perfectly legal to run concurrently with * the _rcu list-traversal primitives, such as * hlist_for_each_entry_rcu(), used to prevent memory-consistency * problems on Alpha CPUs. Regardless of the type of CPU, the * list-traversal primitive must be guarded by rcu_read_lock(). */ static inline void hlist_add_tail_rcu(struct hlist_node *n, struct hlist_head *h) { struct hlist_node *i, *last = NULL; /* Note: write side code, so rcu accessors are not needed. 
*/ for (i = h->first; i; i = i->next) last = i; if (last) { n->next = last->next; WRITE_ONCE(n->pprev, &last->next); rcu_assign_pointer(hlist_next_rcu(last), n); } else { hlist_add_head_rcu(n, h); } } /** * hlist_add_before_rcu * @n: the new element to add to the hash list. * @next: the existing element to add the new element before. * * Description: * Adds the specified element to the specified hlist * before the specified node while permitting racing traversals. * * The caller must take whatever precautions are necessary * (such as holding appropriate locks) to avoid racing * with another list-mutation primitive, such as hlist_add_head_rcu() * or hlist_del_rcu(), running on this same list. * However, it is perfectly legal to run concurrently with * the _rcu list-traversal primitives, such as * hlist_for_each_entry_rcu(), used to prevent memory-consistency * problems on Alpha CPUs. */ static inline void hlist_add_before_rcu(struct hlist_node *n, struct hlist_node *next) { WRITE_ONCE(n->pprev, next->pprev); n->next = next; rcu_assign_pointer(hlist_pprev_rcu(n), n); WRITE_ONCE(next->pprev, &n->next); } /** * hlist_add_behind_rcu * @n: the new element to add to the hash list. * @prev: the existing element to add the new element after. * * Description: * Adds the specified element to the specified hlist * after the specified node while permitting racing traversals. * * The caller must take whatever precautions are necessary * (such as holding appropriate locks) to avoid racing * with another list-mutation primitive, such as hlist_add_head_rcu() * or hlist_del_rcu(), running on this same list. * However, it is perfectly legal to run concurrently with * the _rcu list-traversal primitives, such as * hlist_for_each_entry_rcu(), used to prevent memory-consistency * problems on Alpha CPUs. */ static inline void hlist_add_behind_rcu(struct hlist_node *n, struct hlist_node *prev) { n->next = prev->next; WRITE_ONCE(n->pprev, &prev->next); rcu_assign_pointer(hlist_next_rcu(prev), n); if (n->next) WRITE_ONCE(n->next->pprev, &n->next); } #define __hlist_for_each_rcu(pos, head) \ for (pos = rcu_dereference(hlist_first_rcu(head)); \ pos; \ pos = rcu_dereference(hlist_next_rcu(pos))) /** * hlist_for_each_entry_rcu - iterate over rcu list of given type * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the hlist_node within the struct. * @cond: optional lockdep expression if called from non-RCU protection. * * This list-traversal primitive may safely run concurrently with * the _rcu list-mutation primitives such as hlist_add_head_rcu() * as long as the traversal is guarded by rcu_read_lock(). */ #define hlist_for_each_entry_rcu(pos, head, member, cond...) \ for (__list_check_rcu(dummy, ## cond, 0), \ pos = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),\ typeof(*(pos)), member); \ pos; \ pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(\ &(pos)->member)), typeof(*(pos)), member)) /** * hlist_for_each_entry_srcu - iterate over rcu list of given type * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the hlist_node within the struct. * @cond: lockdep expression for the lock required to traverse the list. * * This list-traversal primitive may safely run concurrently with * the _rcu list-mutation primitives such as hlist_add_head_rcu() * as long as the traversal is guarded by srcu_read_lock(). * The lockdep expression srcu_read_lock_held() can be passed as the * cond argument from read side. 
*/ #define hlist_for_each_entry_srcu(pos, head, member, cond) \ for (__list_check_srcu(cond), \ pos = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),\ typeof(*(pos)), member); \ pos; \ pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(\ &(pos)->member)), typeof(*(pos)), member)) /** * hlist_for_each_entry_rcu_notrace - iterate over rcu list of given type (for tracing) * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the hlist_node within the struct. * * This list-traversal primitive may safely run concurrently with * the _rcu list-mutation primitives such as hlist_add_head_rcu() * as long as the traversal is guarded by rcu_read_lock(). * * This is the same as hlist_for_each_entry_rcu() except that it does * not do any RCU debugging or tracing. */ #define hlist_for_each_entry_rcu_notrace(pos, head, member) \ for (pos = hlist_entry_safe(rcu_dereference_raw_check(hlist_first_rcu(head)),\ typeof(*(pos)), member); \ pos; \ pos = hlist_entry_safe(rcu_dereference_raw_check(hlist_next_rcu(\ &(pos)->member)), typeof(*(pos)), member)) /** * hlist_for_each_entry_rcu_bh - iterate over rcu list of given type * @pos: the type * to use as a loop cursor. * @head: the head for your list. * @member: the name of the hlist_node within the struct. * * This list-traversal primitive may safely run concurrently with * the _rcu list-mutation primitives such as hlist_add_head_rcu() * as long as the traversal is guarded by rcu_read_lock(). */ #define hlist_for_each_entry_rcu_bh(pos, head, member) \ for (pos = hlist_entry_safe(rcu_dereference_bh(hlist_first_rcu(head)),\ typeof(*(pos)), member); \ pos; \ pos = hlist_entry_safe(rcu_dereference_bh(hlist_next_rcu(\ &(pos)->member)), typeof(*(pos)), member)) /** * hlist_for_each_entry_continue_rcu - iterate over a hlist continuing after current point * @pos: the type * to use as a loop cursor. * @member: the name of the hlist_node within the struct. */ #define hlist_for_each_entry_continue_rcu(pos, member) \ for (pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu( \ &(pos)->member)), typeof(*(pos)), member); \ pos; \ pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu( \ &(pos)->member)), typeof(*(pos)), member)) /** * hlist_for_each_entry_continue_rcu_bh - iterate over a hlist continuing after current point * @pos: the type * to use as a loop cursor. * @member: the name of the hlist_node within the struct. */ #define hlist_for_each_entry_continue_rcu_bh(pos, member) \ for (pos = hlist_entry_safe(rcu_dereference_bh(hlist_next_rcu( \ &(pos)->member)), typeof(*(pos)), member); \ pos; \ pos = hlist_entry_safe(rcu_dereference_bh(hlist_next_rcu( \ &(pos)->member)), typeof(*(pos)), member)) /** * hlist_for_each_entry_from_rcu - iterate over a hlist continuing from current point * @pos: the type * to use as a loop cursor. * @member: the name of the hlist_node within the struct. */ #define hlist_for_each_entry_from_rcu(pos, member) \ for (; pos; \ pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu( \ &(pos)->member)), typeof(*(pos)), member)) #endif /* __KERNEL__ */ #endif
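/*
 * Sketch of the usual writer/reader split for the primitives documented
 * above: writers serialize against each other with an ordinary lock while
 * readers only take rcu_read_lock() and use the _rcu traversal and deletion
 * variants. "struct item", items_head and items_lock are hypothetical names
 * used only for illustration.
 */
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/rculist.h>
#include <linux/types.h>

struct item {
	int key;
	struct list_head node;
	struct rcu_head rcu;
};

static LIST_HEAD(items_head);
static DEFINE_SPINLOCK(items_lock);

static void item_add(struct item *it)
{
	spin_lock(&items_lock);
	list_add_rcu(&it->node, &items_head);	/* publish with the needed barrier */
	spin_unlock(&items_lock);
}

static bool item_exists(int key)
{
	struct item *it;
	bool found = false;

	rcu_read_lock();
	list_for_each_entry_rcu(it, &items_head, node) {
		if (it->key == key) {
			found = true;
			break;
		}
	}
	rcu_read_unlock();
	return found;
}

static void item_del(struct item *it)
{
	spin_lock(&items_lock);
	list_del_rcu(&it->node);	/* unlink; concurrent readers may still see it */
	spin_unlock(&items_lock);
	kfree_rcu(it, rcu);		/* free only after a grace period */
}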
6 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * include/linux/if_team.h - Network team device driver header * Copyright (c) 2011 Jiri Pirko <jpirko@redhat.com> */ #ifndef _LINUX_IF_TEAM_H_ #define _LINUX_IF_TEAM_H_ #include <linux/netpoll.h> #include <net/sch_generic.h> #include <linux/types.h> #include <uapi/linux/if_team.h> struct team_pcpu_stats { u64 rx_packets; u64 rx_bytes; u64 rx_multicast; u64 tx_packets; u64 tx_bytes; struct u64_stats_sync syncp; u32 rx_dropped; u32 tx_dropped; u32 rx_nohandler; }; struct team; struct team_port { struct net_device *dev; struct hlist_node hlist; /* node in enabled ports hash list */ struct list_head list; /* node in ordinary list */ struct team *team; int index; /* index of enabled port. If disabled, it's set to -1 */ bool linkup; /* either state.linkup or user.linkup */ struct { bool linkup; u32 speed; u8 duplex; } state; /* Values set by userspace */ struct { bool linkup; bool linkup_enabled; } user; /* Custom gennetlink interface related flags */ bool changed; bool removed; /* * A place for storing original values of the device before it * become a port. */ struct { unsigned char dev_addr[MAX_ADDR_LEN]; unsigned int mtu; } orig; #ifdef CONFIG_NET_POLL_CONTROLLER struct netpoll *np; #endif s32 priority; /* lower number ~ higher priority */ u16 queue_id; struct list_head qom_list; /* node in queue override mapping list */ struct rcu_head rcu; long mode_priv[]; }; static inline struct team_port *team_port_get_rcu(const struct net_device *dev) { return rcu_dereference(dev->rx_handler_data); } static inline bool team_port_enabled(struct team_port *port) { return port->index != -1; } static inline bool team_port_txable(struct team_port *port) { return port->linkup && team_port_enabled(port); } static inline bool team_port_dev_txable(const struct net_device *port_dev) { struct team_port *port; bool txable; rcu_read_lock(); port = team_port_get_rcu(port_dev); txable = port ? 
team_port_txable(port) : false; rcu_read_unlock(); return txable; } #ifdef CONFIG_NET_POLL_CONTROLLER static inline void team_netpoll_send_skb(struct team_port *port, struct sk_buff *skb) { netpoll_send_skb(port->np, skb); } #else static inline void team_netpoll_send_skb(struct team_port *port, struct sk_buff *skb) { } #endif struct team_mode_ops { int (*init)(struct team *team); void (*exit)(struct team *team); rx_handler_result_t (*receive)(struct team *team, struct team_port *port, struct sk_buff *skb); bool (*transmit)(struct team *team, struct sk_buff *skb); int (*port_enter)(struct team *team, struct team_port *port); void (*port_leave)(struct team *team, struct team_port *port); void (*port_change_dev_addr)(struct team *team, struct team_port *port); void (*port_enabled)(struct team *team, struct team_port *port); void (*port_disabled)(struct team *team, struct team_port *port); }; extern int team_modeop_port_enter(struct team *team, struct team_port *port); extern void team_modeop_port_change_dev_addr(struct team *team, struct team_port *port); enum team_option_type { TEAM_OPTION_TYPE_U32, TEAM_OPTION_TYPE_STRING, TEAM_OPTION_TYPE_BINARY, TEAM_OPTION_TYPE_BOOL, TEAM_OPTION_TYPE_S32, }; struct team_option_inst_info { u32 array_index; struct team_port *port; /* != NULL if per-port */ }; struct team_gsetter_ctx { union { u32 u32_val; const char *str_val; struct { const void *ptr; u32 len; } bin_val; bool bool_val; s32 s32_val; } data; struct team_option_inst_info *info; }; struct team_option { struct list_head list; const char *name; bool per_port; unsigned int array_size; /* != 0 means the option is array */ enum team_option_type type; int (*init)(struct team *team, struct team_option_inst_info *info); int (*getter)(struct team *team, struct team_gsetter_ctx *ctx); int (*setter)(struct team *team, struct team_gsetter_ctx *ctx); }; extern void team_option_inst_set_change(struct team_option_inst_info *opt_inst_info); extern void team_options_change_check(struct team *team); struct team_mode { const char *kind; struct module *owner; size_t priv_size; size_t port_priv_size; const struct team_mode_ops *ops; enum netdev_lag_tx_type lag_tx_type; }; #define TEAM_PORT_HASHBITS 4 #define TEAM_PORT_HASHENTRIES (1 << TEAM_PORT_HASHBITS) #define TEAM_MODE_PRIV_LONGS 4 #define TEAM_MODE_PRIV_SIZE (sizeof(long) * TEAM_MODE_PRIV_LONGS) struct team { struct net_device *dev; /* associated netdevice */ struct team_pcpu_stats __percpu *pcpu_stats; const struct header_ops *header_ops_cache; struct mutex lock; /* used for overall locking, e.g. 
port lists write */ /* * List of enabled ports and their count */ int en_port_count; struct hlist_head en_port_hlist[TEAM_PORT_HASHENTRIES]; struct list_head port_list; /* list of all ports */ struct list_head option_list; struct list_head option_inst_list; /* list of option instances */ const struct team_mode *mode; struct team_mode_ops ops; bool user_carrier_enabled; bool queue_override_enabled; struct list_head *qom_lists; /* array of queue override mapping lists */ bool port_mtu_change_allowed; bool notifier_ctx; struct { unsigned int count; unsigned int interval; /* in ms */ atomic_t count_pending; struct delayed_work dw; } notify_peers; struct { unsigned int count; unsigned int interval; /* in ms */ atomic_t count_pending; struct delayed_work dw; } mcast_rejoin; struct lock_class_key team_lock_key; long mode_priv[TEAM_MODE_PRIV_LONGS]; }; static inline int team_dev_queue_xmit(struct team *team, struct team_port *port, struct sk_buff *skb) { BUILD_BUG_ON(sizeof(skb->queue_mapping) != sizeof(qdisc_skb_cb(skb)->slave_dev_queue_mapping)); skb_set_queue_mapping(skb, qdisc_skb_cb(skb)->slave_dev_queue_mapping); skb->dev = port->dev; if (unlikely(netpoll_tx_running(team->dev))) { team_netpoll_send_skb(port, skb); return 0; } return dev_queue_xmit(skb); } static inline struct hlist_head *team_port_index_hash(struct team *team, int port_index) { return &team->en_port_hlist[port_index & (TEAM_PORT_HASHENTRIES - 1)]; } static inline struct team_port *team_get_port_by_index(struct team *team, int port_index) { struct team_port *port; struct hlist_head *head = team_port_index_hash(team, port_index); hlist_for_each_entry(port, head, hlist) if (port->index == port_index) return port; return NULL; } static inline int team_num_to_port_index(struct team *team, unsigned int num) { int en_port_count = READ_ONCE(team->en_port_count); if (unlikely(!en_port_count)) return 0; return num % en_port_count; } static inline struct team_port *team_get_port_by_index_rcu(struct team *team, int port_index) { struct team_port *port; struct hlist_head *head = team_port_index_hash(team, port_index); hlist_for_each_entry_rcu(port, head, hlist) if (port->index == port_index) return port; return NULL; } static inline struct team_port * team_get_first_port_txable_rcu(struct team *team, struct team_port *port) { struct team_port *cur; if (likely(team_port_txable(port))) return port; cur = port; list_for_each_entry_continue_rcu(cur, &team->port_list, list) if (team_port_txable(cur)) return cur; list_for_each_entry_rcu(cur, &team->port_list, list) { if (cur == port) break; if (team_port_txable(cur)) return cur; } return NULL; } extern int team_options_register(struct team *team, const struct team_option *option, size_t option_count); extern void team_options_unregister(struct team *team, const struct team_option *option, size_t option_count); extern int team_mode_register(const struct team_mode *mode); extern void team_mode_unregister(const struct team_mode *mode); #define TEAM_DEFAULT_NUM_TX_QUEUES 16 #define TEAM_DEFAULT_NUM_RX_QUEUES 16 #define MODULE_ALIAS_TEAM_MODE(kind) MODULE_ALIAS("team-mode-" kind) #endif /* _LINUX_IF_TEAM_H_ */
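/*
 * Sketch of a minimal team mode built on the interface above, loosely
 * modeled on the in-tree round-robin mode. The "my_mode" names and the
 * trivial port selection are illustrative; team_mode_register(),
 * team_mode_unregister(), the ops layout and the xmit helpers come from
 * this header.
 */
#include <linux/module.h>
#include <linux/netdevice.h>
#include <linux/if_team.h>

static bool my_mode_transmit(struct team *team, struct sk_buff *skb)
{
	struct team_port *port;

	/* real modes hash or round-robin; here we just try port index 0 */
	port = team_get_port_by_index_rcu(team, 0);
	if (port)
		port = team_get_first_port_txable_rcu(team, port);
	if (unlikely(!port)) {
		dev_kfree_skb_any(skb);
		return false;
	}
	return team_dev_queue_xmit(team, port, skb) == 0;
}

static const struct team_mode_ops my_mode_ops = {
	.transmit	= my_mode_transmit,
};

static const struct team_mode my_mode = {
	.kind		= "mymode",
	.owner		= THIS_MODULE,
	.ops		= &my_mode_ops,
	.lag_tx_type	= NETDEV_LAG_TX_TYPE_UNKNOWN,
};

static int __init my_mode_init(void)
{
	return team_mode_register(&my_mode);
}

static void __exit my_mode_exit(void)
{
	team_mode_unregister(&my_mode);
}

module_init(my_mode_init);
module_exit(my_mode_exit);
MODULE_LICENSE("GPL");
MODULE_ALIAS_TEAM_MODE("mymode");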
// SPDX-License-Identifier: GPL-2.0-or-later /* * DCCP over IPv6 * Linux INET6 implementation * * Based on net/dccp6/ipv6.c * * Arnaldo Carvalho de Melo <acme@ghostprotocols.net> */ #include <linux/module.h> #include <linux/random.h> #include <linux/slab.h> #include <linux/xfrm.h> #include <linux/string.h> #include <net/addrconf.h> #include <net/inet_common.h> #include <net/inet_hashtables.h> #include <net/inet_sock.h> #include <net/inet6_connection_sock.h> #include <net/inet6_hashtables.h> #include <net/ip6_route.h> #include <net/ipv6.h> #include <net/protocol.h> #include <net/transp_v6.h> #include <net/ip6_checksum.h> #include <net/xfrm.h> #include <net/secure_seq.h> #include <net/netns/generic.h> #include <net/sock.h> #include "dccp.h" #include "ipv6.h" #include "feat.h" struct dccp_v6_pernet { struct sock *v6_ctl_sk; }; static unsigned int dccp_v6_pernet_id __read_mostly; /* The per-net v6_ctl_sk is used for sending RSTs and ACKs */ static const struct inet_connection_sock_af_ops dccp_ipv6_mapped; static const struct inet_connection_sock_af_ops dccp_ipv6_af_ops; /* add pseudo-header to DCCP checksum stored in skb->csum */ static inline __sum16 dccp_v6_csum_finish(struct sk_buff *skb, const struct in6_addr *saddr, const struct in6_addr *daddr) { return csum_ipv6_magic(saddr, daddr, skb->len, IPPROTO_DCCP, skb->csum); } static inline void dccp_v6_send_check(struct sock *sk, struct sk_buff *skb) { struct ipv6_pinfo *np = inet6_sk(sk); struct dccp_hdr *dh = dccp_hdr(skb); dccp_csum_outgoing(skb); dh->dccph_checksum = dccp_v6_csum_finish(skb, &np->saddr, &sk->sk_v6_daddr); } static inline __u64 dccp_v6_init_sequence(struct sk_buff *skb) { return secure_dccpv6_sequence_number(ipv6_hdr(skb)->daddr.s6_addr32, ipv6_hdr(skb)->saddr.s6_addr32, dccp_hdr(skb)->dccph_dport, dccp_hdr(skb)->dccph_sport ); } static int dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { const struct ipv6hdr *hdr; const struct dccp_hdr *dh; struct dccp_sock *dp; struct ipv6_pinfo *np; struct sock *sk; int err; __u64 seq; struct net *net = dev_net(skb->dev); if (!pskb_may_pull(skb, offset + sizeof(*dh))) return -EINVAL; dh = (struct dccp_hdr *)(skb->data + offset); if (!pskb_may_pull(skb, offset + __dccp_basic_hdr_len(dh))) return
-EINVAL; hdr = (const struct ipv6hdr *)skb->data; dh = (struct dccp_hdr *)(skb->data + offset); sk = __inet6_lookup_established(net, &dccp_hashinfo, &hdr->daddr, dh->dccph_dport, &hdr->saddr, ntohs(dh->dccph_sport), inet6_iif(skb), 0); if (!sk) { __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev), ICMP6_MIB_INERRORS); return -ENOENT; } if (sk->sk_state == DCCP_TIME_WAIT) { inet_twsk_put(inet_twsk(sk)); return 0; } seq = dccp_hdr_seq(dh); if (sk->sk_state == DCCP_NEW_SYN_RECV) { dccp_req_err(sk, seq); return 0; } bh_lock_sock(sk); if (sock_owned_by_user(sk)) __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS); if (sk->sk_state == DCCP_CLOSED) goto out; dp = dccp_sk(sk); if ((1 << sk->sk_state) & ~(DCCPF_REQUESTING | DCCPF_LISTEN) && !between48(seq, dp->dccps_awl, dp->dccps_awh)) { __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS); goto out; } np = inet6_sk(sk); if (type == NDISC_REDIRECT) { if (!sock_owned_by_user(sk)) { struct dst_entry *dst = __sk_dst_check(sk, np->dst_cookie); if (dst) dst->ops->redirect(dst, sk, skb); } goto out; } if (type == ICMPV6_PKT_TOOBIG) { struct dst_entry *dst = NULL; if (!ip6_sk_accept_pmtu(sk)) goto out; if (sock_owned_by_user(sk)) goto out; if ((1 << sk->sk_state) & (DCCPF_LISTEN | DCCPF_CLOSED)) goto out; dst = inet6_csk_update_pmtu(sk, ntohl(info)); if (!dst) goto out; if (inet_csk(sk)->icsk_pmtu_cookie > dst_mtu(dst)) dccp_sync_mss(sk, dst_mtu(dst)); goto out; } icmpv6_err_convert(type, code, &err); /* Might be for an request_sock */ switch (sk->sk_state) { case DCCP_REQUESTING: case DCCP_RESPOND: /* Cannot happen. It can, it SYNs are crossed. --ANK */ if (!sock_owned_by_user(sk)) { __DCCP_INC_STATS(DCCP_MIB_ATTEMPTFAILS); sk->sk_err = err; /* * Wake people up to see the error * (see connect in sock.c) */ sk_error_report(sk); dccp_done(sk); } else sk->sk_err_soft = err; goto out; } if (!sock_owned_by_user(sk) && np->recverr) { sk->sk_err = err; sk_error_report(sk); } else sk->sk_err_soft = err; out: bh_unlock_sock(sk); sock_put(sk); return 0; } static int dccp_v6_send_response(const struct sock *sk, struct request_sock *req) { struct inet_request_sock *ireq = inet_rsk(req); struct ipv6_pinfo *np = inet6_sk(sk); struct sk_buff *skb; struct in6_addr *final_p, final; struct flowi6 fl6; int err = -1; struct dst_entry *dst; memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_proto = IPPROTO_DCCP; fl6.daddr = ireq->ir_v6_rmt_addr; fl6.saddr = ireq->ir_v6_loc_addr; fl6.flowlabel = 0; fl6.flowi6_oif = ireq->ir_iif; fl6.fl6_dport = ireq->ir_rmt_port; fl6.fl6_sport = htons(ireq->ir_num); security_req_classify_flow(req, flowi6_to_flowi_common(&fl6)); rcu_read_lock(); final_p = fl6_update_dst(&fl6, rcu_dereference(np->opt), &final); rcu_read_unlock(); dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p); if (IS_ERR(dst)) { err = PTR_ERR(dst); dst = NULL; goto done; } skb = dccp_make_response(sk, dst, req); if (skb != NULL) { struct dccp_hdr *dh = dccp_hdr(skb); struct ipv6_txoptions *opt; dh->dccph_checksum = dccp_v6_csum_finish(skb, &ireq->ir_v6_loc_addr, &ireq->ir_v6_rmt_addr); fl6.daddr = ireq->ir_v6_rmt_addr; rcu_read_lock(); opt = ireq->ipv6_opt; if (!opt) opt = rcu_dereference(np->opt); err = ip6_xmit(sk, skb, &fl6, sk->sk_mark, opt, np->tclass, sk->sk_priority); rcu_read_unlock(); err = net_xmit_eval(err); } done: dst_release(dst); return err; } static void dccp_v6_reqsk_destructor(struct request_sock *req) { dccp_feat_list_purge(&dccp_rsk(req)->dreq_featneg); kfree(inet_rsk(req)->ipv6_opt); kfree_skb(inet_rsk(req)->pktopts); } static void dccp_v6_ctl_send_reset(const 
struct sock *sk, struct sk_buff *rxskb) { const struct ipv6hdr *rxip6h; struct sk_buff *skb; struct flowi6 fl6; struct net *net = dev_net(skb_dst(rxskb)->dev); struct dccp_v6_pernet *pn; struct sock *ctl_sk; struct dst_entry *dst; if (dccp_hdr(rxskb)->dccph_type == DCCP_PKT_RESET) return; if (!ipv6_unicast_destination(rxskb)) return; pn = net_generic(net, dccp_v6_pernet_id); ctl_sk = pn->v6_ctl_sk; skb = dccp_ctl_make_reset(ctl_sk, rxskb); if (skb == NULL) return; rxip6h = ipv6_hdr(rxskb); dccp_hdr(skb)->dccph_checksum = dccp_v6_csum_finish(skb, &rxip6h->saddr, &rxip6h->daddr); memset(&fl6, 0, sizeof(fl6)); fl6.daddr = rxip6h->saddr; fl6.saddr = rxip6h->daddr; fl6.flowi6_proto = IPPROTO_DCCP; fl6.flowi6_oif = inet6_iif(rxskb); fl6.fl6_dport = dccp_hdr(skb)->dccph_dport; fl6.fl6_sport = dccp_hdr(skb)->dccph_sport; security_skb_classify_flow(rxskb, flowi6_to_flowi_common(&fl6)); /* sk = NULL, but it is safe for now. RST socket required. */ dst = ip6_dst_lookup_flow(sock_net(ctl_sk), ctl_sk, &fl6, NULL); if (!IS_ERR(dst)) { skb_dst_set(skb, dst); ip6_xmit(ctl_sk, skb, &fl6, 0, NULL, 0, 0); DCCP_INC_STATS(DCCP_MIB_OUTSEGS); DCCP_INC_STATS(DCCP_MIB_OUTRSTS); return; } kfree_skb(skb); } static struct request_sock_ops dccp6_request_sock_ops = { .family = AF_INET6, .obj_size = sizeof(struct dccp6_request_sock), .rtx_syn_ack = dccp_v6_send_response, .send_ack = dccp_reqsk_send_ack, .destructor = dccp_v6_reqsk_destructor, .send_reset = dccp_v6_ctl_send_reset, .syn_ack_timeout = dccp_syn_ack_timeout, }; static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb) { struct request_sock *req; struct dccp_request_sock *dreq; struct inet_request_sock *ireq; struct ipv6_pinfo *np = inet6_sk(sk); const __be32 service = dccp_hdr_request(skb)->dccph_req_service; struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb); if (skb->protocol == htons(ETH_P_IP)) return dccp_v4_conn_request(sk, skb); if (!ipv6_unicast_destination(skb)) return 0; /* discard, don't send a reset here */ if (ipv6_addr_v4mapped(&ipv6_hdr(skb)->saddr)) { __IP6_INC_STATS(sock_net(sk), NULL, IPSTATS_MIB_INHDRERRORS); return 0; } if (dccp_bad_service_code(sk, service)) { dcb->dccpd_reset_code = DCCP_RESET_CODE_BAD_SERVICE_CODE; goto drop; } /* * There are no SYN attacks on IPv6, yet... */ dcb->dccpd_reset_code = DCCP_RESET_CODE_TOO_BUSY; if (inet_csk_reqsk_queue_is_full(sk)) goto drop; if (sk_acceptq_is_full(sk)) goto drop; req = inet_reqsk_alloc(&dccp6_request_sock_ops, sk, true); if (req == NULL) goto drop; if (dccp_reqsk_init(req, dccp_sk(sk), skb)) goto drop_and_free; dreq = dccp_rsk(req); if (dccp_parse_options(sk, dreq, skb)) goto drop_and_free; ireq = inet_rsk(req); ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr; ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr; ireq->ireq_family = AF_INET6; ireq->ir_mark = inet_request_mark(sk, skb); if (security_inet_conn_request(sk, skb, req)) goto drop_and_free; if (ipv6_opt_accepted(sk, skb, IP6CB(skb)) || np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo || np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) { refcount_inc(&skb->users); ireq->pktopts = skb; } ireq->ir_iif = sk->sk_bound_dev_if; /* So that link locals have meaning */ if (!sk->sk_bound_dev_if && ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL) ireq->ir_iif = inet6_iif(skb); /* * Step 3: Process LISTEN state * * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookie * * Setting S.SWL/S.SWH to is deferred to dccp_create_openreq_child(). 
*/ dreq->dreq_isr = dcb->dccpd_seq; dreq->dreq_gsr = dreq->dreq_isr; dreq->dreq_iss = dccp_v6_init_sequence(skb); dreq->dreq_gss = dreq->dreq_iss; dreq->dreq_service = service; if (dccp_v6_send_response(sk, req)) goto drop_and_free; if (unlikely(!inet_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT))) reqsk_free(req); else reqsk_put(req); return 0; drop_and_free: reqsk_free(req); drop: __DCCP_INC_STATS(DCCP_MIB_ATTEMPTFAILS); return -1; } static struct sock *dccp_v6_request_recv_sock(const struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct dst_entry *dst, struct request_sock *req_unhash, bool *own_req) { struct inet_request_sock *ireq = inet_rsk(req); struct ipv6_pinfo *newnp; const struct ipv6_pinfo *np = inet6_sk(sk); struct ipv6_txoptions *opt; struct inet_sock *newinet; struct dccp6_sock *newdp6; struct sock *newsk; if (skb->protocol == htons(ETH_P_IP)) { /* * v6 mapped */ newsk = dccp_v4_request_recv_sock(sk, skb, req, dst, req_unhash, own_req); if (newsk == NULL) return NULL; newdp6 = (struct dccp6_sock *)newsk; newinet = inet_sk(newsk); newinet->pinet6 = &newdp6->inet6; newnp = inet6_sk(newsk); memcpy(newnp, np, sizeof(struct ipv6_pinfo)); newnp->saddr = newsk->sk_v6_rcv_saddr; inet_csk(newsk)->icsk_af_ops = &dccp_ipv6_mapped; newsk->sk_backlog_rcv = dccp_v4_do_rcv; newnp->pktoptions = NULL; newnp->opt = NULL; newnp->ipv6_mc_list = NULL; newnp->ipv6_ac_list = NULL; newnp->ipv6_fl_list = NULL; newnp->mcast_oif = inet_iif(skb); newnp->mcast_hops = ip_hdr(skb)->ttl; /* * No need to charge this sock to the relevant IPv6 refcnt debug socks count * here, dccp_create_openreq_child now does this for us, see the comment in * that function for the gory details. -acme */ /* It is tricky place. Until this moment IPv4 tcp worked with IPv6 icsk.icsk_af_ops. Sync it now. */ dccp_sync_mss(newsk, inet_csk(newsk)->icsk_pmtu_cookie); return newsk; } if (sk_acceptq_is_full(sk)) goto out_overflow; if (!dst) { struct flowi6 fl6; dst = inet6_csk_route_req(sk, &fl6, req, IPPROTO_DCCP); if (!dst) goto out; } newsk = dccp_create_openreq_child(sk, req, skb); if (newsk == NULL) goto out_nonewsk; /* * No need to charge this sock to the relevant IPv6 refcnt debug socks * count here, dccp_create_openreq_child now does this for us, see the * comment in that function for the gory details. -acme */ ip6_dst_store(newsk, dst, NULL, NULL); newsk->sk_route_caps = dst->dev->features & ~(NETIF_F_IP_CSUM | NETIF_F_TSO); newdp6 = (struct dccp6_sock *)newsk; newinet = inet_sk(newsk); newinet->pinet6 = &newdp6->inet6; newnp = inet6_sk(newsk); memcpy(newnp, np, sizeof(struct ipv6_pinfo)); newsk->sk_v6_daddr = ireq->ir_v6_rmt_addr; newnp->saddr = ireq->ir_v6_loc_addr; newsk->sk_v6_rcv_saddr = ireq->ir_v6_loc_addr; newsk->sk_bound_dev_if = ireq->ir_iif; /* Now IPv6 options... First: no IPv4 options. */ newinet->inet_opt = NULL; /* Clone RX bits */ newnp->rxopt.all = np->rxopt.all; newnp->ipv6_mc_list = NULL; newnp->ipv6_ac_list = NULL; newnp->ipv6_fl_list = NULL; newnp->pktoptions = NULL; newnp->opt = NULL; newnp->mcast_oif = inet6_iif(skb); newnp->mcast_hops = ipv6_hdr(skb)->hop_limit; /* * Clone native IPv6 options from listening socket (if any) * * Yes, keeping reference count would be much more clever, but we make * one more one thing there: reattach optmem to newsk. 
*/ opt = ireq->ipv6_opt; if (!opt) opt = rcu_dereference(np->opt); if (opt) { opt = ipv6_dup_options(newsk, opt); RCU_INIT_POINTER(newnp->opt, opt); } inet_csk(newsk)->icsk_ext_hdr_len = 0; if (opt) inet_csk(newsk)->icsk_ext_hdr_len = opt->opt_nflen + opt->opt_flen; dccp_sync_mss(newsk, dst_mtu(dst)); newinet->inet_daddr = newinet->inet_saddr = LOOPBACK4_IPV6; newinet->inet_rcv_saddr = LOOPBACK4_IPV6; if (__inet_inherit_port(sk, newsk) < 0) { inet_csk_prepare_forced_close(newsk); dccp_done(newsk); goto out; } *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash), NULL); /* Clone pktoptions received with SYN, if we own the req */ if (*own_req && ireq->pktopts) { newnp->pktoptions = skb_clone_and_charge_r(ireq->pktopts, newsk); consume_skb(ireq->pktopts); ireq->pktopts = NULL; } return newsk; out_overflow: __NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); out_nonewsk: dst_release(dst); out: __NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENDROPS); return NULL; } /* The socket must have it's spinlock held when we get * here. * * We have a potential double-lock case here, so even when * doing backlog processing we use the BH locking scheme. * This is because we cannot sleep with the original spinlock * held. */ static int dccp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) { struct ipv6_pinfo *np = inet6_sk(sk); struct sk_buff *opt_skb = NULL; /* Imagine: socket is IPv6. IPv4 packet arrives, goes to IPv4 receive handler and backlogged. From backlog it always goes here. Kerboom... Fortunately, dccp_rcv_established and rcv_established handle them correctly, but it is not case with dccp_v6_hnd_req and dccp_v6_ctl_send_reset(). --ANK */ if (skb->protocol == htons(ETH_P_IP)) return dccp_v4_do_rcv(sk, skb); if (sk_filter(sk, skb)) goto discard; /* * socket locking is here for SMP purposes as backlog rcv is currently * called with bh processing disabled. */ /* Do Stevens' IPV6_PKTOPTIONS. Yes, guys, it is the only place in our code, where we may make it not affecting IPv4. The rest of code is protocol independent, and I do not like idea to uglify IPv4. Actually, all the idea behind IPV6_PKTOPTIONS looks not very well thought. For now we latch options, received in the last packet, enqueued by tcp. Feel free to propose better solution. --ANK (980728) */ if (np->rxopt.all && sk->sk_state != DCCP_LISTEN) opt_skb = skb_clone_and_charge_r(skb, sk); if (sk->sk_state == DCCP_OPEN) { /* Fast path */ if (dccp_rcv_established(sk, skb, dccp_hdr(skb), skb->len)) goto reset; if (opt_skb) goto ipv6_pktoptions; return 0; } /* * Step 3: Process LISTEN state * If S.state == LISTEN, * If P.type == Request or P contains a valid Init Cookie option, * (* Must scan the packet's options to check for Init * Cookies. Only Init Cookies are processed here, * however; other options are processed in Step 8. 
This * scan need only be performed if the endpoint uses Init * Cookies *) * (* Generate a new socket and switch to that socket *) * Set S := new socket for this port pair * S.state = RESPOND * Choose S.ISS (initial seqno) or set from Init Cookies * Initialize S.GAR := S.ISS * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookies * Continue with S.state == RESPOND * (* A Response packet will be generated in Step 11 *) * Otherwise, * Generate Reset(No Connection) unless P.type == Reset * Drop packet and return * * NOTE: the check for the packet types is done in * dccp_rcv_state_process */ if (dccp_rcv_state_process(sk, skb, dccp_hdr(skb), skb->len)) goto reset; if (opt_skb) goto ipv6_pktoptions; return 0; reset: dccp_v6_ctl_send_reset(sk, skb); discard: if (opt_skb != NULL) __kfree_skb(opt_skb); kfree_skb(skb); return 0; /* Handling IPV6_PKTOPTIONS skb the similar * way it's done for net/ipv6/tcp_ipv6.c */ ipv6_pktoptions: if (!((1 << sk->sk_state) & (DCCPF_CLOSED | DCCPF_LISTEN))) { if (np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo) np->mcast_oif = inet6_iif(opt_skb); if (np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) np->mcast_hops = ipv6_hdr(opt_skb)->hop_limit; if (np->rxopt.bits.rxflow || np->rxopt.bits.rxtclass) np->rcv_flowinfo = ip6_flowinfo(ipv6_hdr(opt_skb)); if (np->repflow) np->flow_label = ip6_flowlabel(ipv6_hdr(opt_skb)); if (ipv6_opt_accepted(sk, opt_skb, &DCCP_SKB_CB(opt_skb)->header.h6)) { memmove(IP6CB(opt_skb), &DCCP_SKB_CB(opt_skb)->header.h6, sizeof(struct inet6_skb_parm)); opt_skb = xchg(&np->pktoptions, opt_skb); } else { __kfree_skb(opt_skb); opt_skb = xchg(&np->pktoptions, NULL); } } kfree_skb(opt_skb); return 0; } static int dccp_v6_rcv(struct sk_buff *skb) { const struct dccp_hdr *dh; bool refcounted; struct sock *sk; int min_cov; /* Step 1: Check header basics */ if (dccp_invalid_packet(skb)) goto discard_it; /* Step 1: If header checksum is incorrect, drop packet and return. */ if (dccp_v6_csum_finish(skb, &ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr)) { DCCP_WARN("dropped packet with invalid checksum\n"); goto discard_it; } dh = dccp_hdr(skb); DCCP_SKB_CB(skb)->dccpd_seq = dccp_hdr_seq(dh); DCCP_SKB_CB(skb)->dccpd_type = dh->dccph_type; if (dccp_packet_without_ack(skb)) DCCP_SKB_CB(skb)->dccpd_ack_seq = DCCP_PKT_WITHOUT_ACK_SEQ; else DCCP_SKB_CB(skb)->dccpd_ack_seq = dccp_hdr_ack_seq(skb); lookup: sk = __inet6_lookup_skb(&dccp_hashinfo, skb, __dccp_hdr_len(dh), dh->dccph_sport, dh->dccph_dport, inet6_iif(skb), 0, &refcounted); if (!sk) { dccp_pr_debug("failed to look up flow ID in table and " "get corresponding socket\n"); goto no_dccp_socket; } /* * Step 2: * ... or S.state == TIMEWAIT, * Generate Reset(No Connection) unless P.type == Reset * Drop packet and return */ if (sk->sk_state == DCCP_TIME_WAIT) { dccp_pr_debug("sk->sk_state == DCCP_TIME_WAIT: do_time_wait\n"); inet_twsk_put(inet_twsk(sk)); goto no_dccp_socket; } if (sk->sk_state == DCCP_NEW_SYN_RECV) { struct request_sock *req = inet_reqsk(sk); struct sock *nsk; sk = req->rsk_listener; if (unlikely(sk->sk_state != DCCP_LISTEN)) { inet_csk_reqsk_queue_drop_and_put(sk, req); goto lookup; } sock_hold(sk); refcounted = true; nsk = dccp_check_req(sk, skb, req); if (!nsk) { reqsk_put(req); goto discard_and_relse; } if (nsk == sk) { reqsk_put(req); } else if (dccp_child_process(sk, nsk, skb)) { dccp_v6_ctl_send_reset(sk, skb); goto discard_and_relse; } else { sock_put(sk); return 0; } } /* * RFC 4340, sec. 
9.2.1: Minimum Checksum Coverage * o if MinCsCov = 0, only packets with CsCov = 0 are accepted * o if MinCsCov > 0, also accept packets with CsCov >= MinCsCov */ min_cov = dccp_sk(sk)->dccps_pcrlen; if (dh->dccph_cscov && (min_cov == 0 || dh->dccph_cscov < min_cov)) { dccp_pr_debug("Packet CsCov %d does not satisfy MinCsCov %d\n", dh->dccph_cscov, min_cov); /* FIXME: send Data Dropped option (see also dccp_v4_rcv) */ goto discard_and_relse; } if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) goto discard_and_relse; return __sk_receive_skb(sk, skb, 1, dh->dccph_doff * 4, refcounted) ? -1 : 0; no_dccp_socket: if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) goto discard_it; /* * Step 2: * If no socket ... * Generate Reset(No Connection) unless P.type == Reset * Drop packet and return */ if (dh->dccph_type != DCCP_PKT_RESET) { DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_NO_CONNECTION; dccp_v6_ctl_send_reset(sk, skb); } discard_it: kfree_skb(skb); return 0; discard_and_relse: if (refcounted) sock_put(sk); goto discard_it; } static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { struct sockaddr_in6 *usin = (struct sockaddr_in6 *)uaddr; struct inet_connection_sock *icsk = inet_csk(sk); struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); struct dccp_sock *dp = dccp_sk(sk); struct in6_addr *saddr = NULL, *final_p, final; struct ipv6_txoptions *opt; struct flowi6 fl6; struct dst_entry *dst; int addr_type; int err; dp->dccps_role = DCCP_ROLE_CLIENT; if (addr_len < SIN6_LEN_RFC2133) return -EINVAL; if (usin->sin6_family != AF_INET6) return -EAFNOSUPPORT; memset(&fl6, 0, sizeof(fl6)); if (np->sndflow) { fl6.flowlabel = usin->sin6_flowinfo & IPV6_FLOWINFO_MASK; IP6_ECN_flow_init(fl6.flowlabel); if (fl6.flowlabel & IPV6_FLOWLABEL_MASK) { struct ip6_flowlabel *flowlabel; flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); if (IS_ERR(flowlabel)) return -EINVAL; fl6_sock_release(flowlabel); } } /* * connect() to INADDR_ANY means loopback (BSD'ism). */ if (ipv6_addr_any(&usin->sin6_addr)) usin->sin6_addr.s6_addr[15] = 1; addr_type = ipv6_addr_type(&usin->sin6_addr); if (addr_type & IPV6_ADDR_MULTICAST) return -ENETUNREACH; if (addr_type & IPV6_ADDR_LINKLOCAL) { if (addr_len >= sizeof(struct sockaddr_in6) && usin->sin6_scope_id) { /* If interface is set while binding, indices * must coincide. */ if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != usin->sin6_scope_id) return -EINVAL; sk->sk_bound_dev_if = usin->sin6_scope_id; } /* Connect to link-local address requires an interface */ if (!sk->sk_bound_dev_if) return -EINVAL; } sk->sk_v6_daddr = usin->sin6_addr; np->flow_label = fl6.flowlabel; /* * DCCP over IPv4 */ if (addr_type == IPV6_ADDR_MAPPED) { u32 exthdrlen = icsk->icsk_ext_hdr_len; struct sockaddr_in sin; SOCK_DEBUG(sk, "connect: ipv4 mapped\n"); if (__ipv6_only_sock(sk)) return -ENETUNREACH; sin.sin_family = AF_INET; sin.sin_port = usin->sin6_port; sin.sin_addr.s_addr = usin->sin6_addr.s6_addr32[3]; icsk->icsk_af_ops = &dccp_ipv6_mapped; sk->sk_backlog_rcv = dccp_v4_do_rcv; err = dccp_v4_connect(sk, (struct sockaddr *)&sin, sizeof(sin)); if (err) { icsk->icsk_ext_hdr_len = exthdrlen; icsk->icsk_af_ops = &dccp_ipv6_af_ops; sk->sk_backlog_rcv = dccp_v6_do_rcv; goto failure; } np->saddr = sk->sk_v6_rcv_saddr; return err; } if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr)) saddr = &sk->sk_v6_rcv_saddr; fl6.flowi6_proto = IPPROTO_DCCP; fl6.daddr = sk->sk_v6_daddr; fl6.saddr = saddr ? 
*saddr : np->saddr; fl6.flowi6_oif = sk->sk_bound_dev_if; fl6.fl6_dport = usin->sin6_port; fl6.fl6_sport = inet->inet_sport; security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6)); opt = rcu_dereference_protected(np->opt, lockdep_sock_is_held(sk)); final_p = fl6_update_dst(&fl6, opt, &final); dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p); if (IS_ERR(dst)) { err = PTR_ERR(dst); goto failure; } if (saddr == NULL) { saddr = &fl6.saddr; sk->sk_v6_rcv_saddr = *saddr; } /* set the source address */ np->saddr = *saddr; inet->inet_rcv_saddr = LOOPBACK4_IPV6; ip6_dst_store(sk, dst, NULL, NULL); icsk->icsk_ext_hdr_len = 0; if (opt) icsk->icsk_ext_hdr_len = opt->opt_flen + opt->opt_nflen; inet->inet_dport = usin->sin6_port; dccp_set_state(sk, DCCP_REQUESTING); err = inet6_hash_connect(&dccp_death_row, sk); if (err) goto late_failure; dp->dccps_iss = secure_dccpv6_sequence_number(np->saddr.s6_addr32, sk->sk_v6_daddr.s6_addr32, inet->inet_sport, inet->inet_dport); err = dccp_connect(sk); if (err) goto late_failure; return 0; late_failure: dccp_set_state(sk, DCCP_CLOSED); if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) inet_reset_saddr(sk); __sk_dst_reset(sk); failure: inet->inet_dport = 0; sk->sk_route_caps = 0; return err; } static const struct inet_connection_sock_af_ops dccp_ipv6_af_ops = { .queue_xmit = inet6_csk_xmit, .send_check = dccp_v6_send_check, .rebuild_header = inet6_sk_rebuild_header, .conn_request = dccp_v6_conn_request, .syn_recv_sock = dccp_v6_request_recv_sock, .net_header_len = sizeof(struct ipv6hdr), .setsockopt = ipv6_setsockopt, .getsockopt = ipv6_getsockopt, .addr2sockaddr = inet6_csk_addr2sockaddr, .sockaddr_len = sizeof(struct sockaddr_in6), }; /* * DCCP over IPv4 via INET6 API */ static const struct inet_connection_sock_af_ops dccp_ipv6_mapped = { .queue_xmit = ip_queue_xmit, .send_check = dccp_v4_send_check, .rebuild_header = inet_sk_rebuild_header, .conn_request = dccp_v6_conn_request, .syn_recv_sock = dccp_v6_request_recv_sock, .net_header_len = sizeof(struct iphdr), .setsockopt = ipv6_setsockopt, .getsockopt = ipv6_getsockopt, .addr2sockaddr = inet6_csk_addr2sockaddr, .sockaddr_len = sizeof(struct sockaddr_in6), }; static void dccp_v6_sk_destruct(struct sock *sk) { dccp_destruct_common(sk); inet6_sock_destruct(sk); } /* NOTE: A lot of things set to zero explicitly by call to * sk_alloc() so need not be done here. 
*/ static int dccp_v6_init_sock(struct sock *sk) { static __u8 dccp_v6_ctl_sock_initialized; int err = dccp_init_sock(sk, dccp_v6_ctl_sock_initialized); if (err == 0) { if (unlikely(!dccp_v6_ctl_sock_initialized)) dccp_v6_ctl_sock_initialized = 1; inet_csk(sk)->icsk_af_ops = &dccp_ipv6_af_ops; sk->sk_destruct = dccp_v6_sk_destruct; } return err; } static struct timewait_sock_ops dccp6_timewait_sock_ops = { .twsk_obj_size = sizeof(struct dccp6_timewait_sock), }; static struct proto dccp_v6_prot = { .name = "DCCPv6", .owner = THIS_MODULE, .close = dccp_close, .connect = dccp_v6_connect, .disconnect = dccp_disconnect, .ioctl = dccp_ioctl, .init = dccp_v6_init_sock, .setsockopt = dccp_setsockopt, .getsockopt = dccp_getsockopt, .sendmsg = dccp_sendmsg, .recvmsg = dccp_recvmsg, .backlog_rcv = dccp_v6_do_rcv, .hash = inet6_hash, .unhash = inet_unhash, .accept = inet_csk_accept, .get_port = inet_csk_get_port, .shutdown = dccp_shutdown, .destroy = dccp_destroy_sock, .orphan_count = &dccp_orphan_count, .max_header = MAX_DCCP_HEADER, .obj_size = sizeof(struct dccp6_sock), .slab_flags = SLAB_TYPESAFE_BY_RCU, .rsk_prot = &dccp6_request_sock_ops, .twsk_prot = &dccp6_timewait_sock_ops, .h.hashinfo = &dccp_hashinfo, }; static const struct inet6_protocol dccp_v6_protocol = { .handler = dccp_v6_rcv, .err_handler = dccp_v6_err, .flags = INET6_PROTO_NOPOLICY | INET6_PROTO_FINAL, }; static const struct proto_ops inet6_dccp_ops = { .family = PF_INET6, .owner = THIS_MODULE, .release = inet6_release, .bind = inet6_bind, .connect = inet_stream_connect, .socketpair = sock_no_socketpair, .accept = inet_accept, .getname = inet6_getname, .poll = dccp_poll, .ioctl = inet6_ioctl, .gettstamp = sock_gettstamp, .listen = inet_dccp_listen, .shutdown = inet_shutdown, .setsockopt = sock_common_setsockopt, .getsockopt = sock_common_getsockopt, .sendmsg = inet_sendmsg, .recvmsg = sock_common_recvmsg, .mmap = sock_no_mmap, .sendpage = sock_no_sendpage, #ifdef CONFIG_COMPAT .compat_ioctl = inet6_compat_ioctl, #endif }; static struct inet_protosw dccp_v6_protosw = { .type = SOCK_DCCP, .protocol = IPPROTO_DCCP, .prot = &dccp_v6_prot, .ops = &inet6_dccp_ops, .flags = INET_PROTOSW_ICSK, }; static int __net_init dccp_v6_init_net(struct net *net) { struct dccp_v6_pernet *pn = net_generic(net, dccp_v6_pernet_id); if (dccp_hashinfo.bhash == NULL) return -ESOCKTNOSUPPORT; return inet_ctl_sock_create(&pn->v6_ctl_sk, PF_INET6, SOCK_DCCP, IPPROTO_DCCP, net); } static void __net_exit dccp_v6_exit_net(struct net *net) { struct dccp_v6_pernet *pn = net_generic(net, dccp_v6_pernet_id); inet_ctl_sock_destroy(pn->v6_ctl_sk); } static void __net_exit dccp_v6_exit_batch(struct list_head *net_exit_list) { inet_twsk_purge(&dccp_hashinfo, AF_INET6); } static struct pernet_operations dccp_v6_ops = { .init = dccp_v6_init_net, .exit = dccp_v6_exit_net, .exit_batch = dccp_v6_exit_batch, .id = &dccp_v6_pernet_id, .size = sizeof(struct dccp_v6_pernet), }; static int __init dccp_v6_init(void) { int err = proto_register(&dccp_v6_prot, 1); if (err) goto out; inet6_register_protosw(&dccp_v6_protosw); err = register_pernet_subsys(&dccp_v6_ops); if (err) goto out_destroy_ctl_sock; err = inet6_add_protocol(&dccp_v6_protocol, IPPROTO_DCCP); if (err) goto out_unregister_proto; out: return err; out_unregister_proto: unregister_pernet_subsys(&dccp_v6_ops); out_destroy_ctl_sock: inet6_unregister_protosw(&dccp_v6_protosw); proto_unregister(&dccp_v6_prot); goto out; } static void __exit dccp_v6_exit(void) { inet6_del_protocol(&dccp_v6_protocol, IPPROTO_DCCP); 
        unregister_pernet_subsys(&dccp_v6_ops);
        inet6_unregister_protosw(&dccp_v6_protosw);
        proto_unregister(&dccp_v6_prot);
}

module_init(dccp_v6_init);
module_exit(dccp_v6_exit);

/*
 * __stringify doesn't like enums, so use SOCK_DCCP (6) and IPPROTO_DCCP (33)
 * values directly. Also cover the case where the protocol is not specified,
 * i.e. net-pf-PF_INET6-proto-0-type-SOCK_DCCP
 */
MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET6, 33, 6);
MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET6, 0, 6);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Arnaldo Carvalho de Melo <acme@mandriva.com>");
MODULE_DESCRIPTION("DCCPv6 - Datagram Congestion Controlled Protocol");
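The protosw/protocol registration above is what makes socket(AF_INET6, SOCK_DCCP, IPPROTO_DCCP) reach this code. A minimal userspace sketch of a client that exercises dccp_v6_connect() could look like the following; the port, the example service-code value, and the locally defined SOL_DCCP fallback are illustrative assumptions, not taken from the file above.

#include <stdio.h>
#include <stdint.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/dccp.h>                 /* DCCP_SOCKOPT_SERVICE */

#ifndef SOL_DCCP
#define SOL_DCCP 269                    /* value from include/linux/socket.h */
#endif

int main(void)
{
        struct sockaddr_in6 sa = { .sin6_family = AF_INET6,
                                   .sin6_port   = htons(5001) };
        uint32_t service = htonl(42);   /* checked by dccp_bad_service_code() on the listener */
        int fd;

        inet_pton(AF_INET6, "::1", &sa.sin6_addr);

        fd = socket(AF_INET6, SOCK_DCCP, IPPROTO_DCCP);
        if (fd < 0) {
                perror("socket");
                return 1;
        }
        if (setsockopt(fd, SOL_DCCP, DCCP_SOCKOPT_SERVICE,
                       &service, sizeof(service)) < 0)
                perror("DCCP_SOCKOPT_SERVICE");
        if (connect(fd, (struct sockaddr *)&sa, sizeof(sa)) < 0)
                perror("connect");

        close(fd);
        return 0;
}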
658 271 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _NF_NAT_H #define _NF_NAT_H #include <linux/list.h> #include <linux/netfilter_ipv4.h> #include <linux/netfilter/nf_conntrack_pptp.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_extend.h> #include <net/netfilter/nf_conntrack_tuple.h> #include <uapi/linux/netfilter/nf_nat.h> enum nf_nat_manip_type { NF_NAT_MANIP_SRC, NF_NAT_MANIP_DST }; /* SRC manip occurs POST_ROUTING or LOCAL_IN */ #define HOOK2MANIP(hooknum) ((hooknum) != NF_INET_POST_ROUTING && \ (hooknum) != NF_INET_LOCAL_IN) /* per conntrack: nat application helper private data */ union nf_conntrack_nat_help { /* insert nat helper private data here */ #if IS_ENABLED(CONFIG_NF_NAT_PPTP) struct nf_nat_pptp nat_pptp_info; #endif }; /* The structure embedded in the conntrack structure. */ struct nf_conn_nat { union nf_conntrack_nat_help help; #if IS_ENABLED(CONFIG_NF_NAT_MASQUERADE) int masq_index; #endif }; /* Set up the info structure to map into this range. */ unsigned int nf_nat_setup_info(struct nf_conn *ct, const struct nf_nat_range2 *range, enum nf_nat_manip_type maniptype); extern unsigned int nf_nat_alloc_null_binding(struct nf_conn *ct, unsigned int hooknum); struct nf_conn_nat *nf_ct_nat_ext_add(struct nf_conn *ct); static inline struct nf_conn_nat *nfct_nat(const struct nf_conn *ct) { #if IS_ENABLED(CONFIG_NF_NAT) return nf_ct_ext_find(ct, NF_CT_EXT_NAT); #else return NULL; #endif } static inline bool nf_nat_oif_changed(unsigned int hooknum, enum ip_conntrack_info ctinfo, struct nf_conn_nat *nat, const struct net_device *out) { #if IS_ENABLED(CONFIG_NF_NAT_MASQUERADE) return nat && nat->masq_index && hooknum == NF_INET_POST_ROUTING && CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL && nat->masq_index != out->ifindex; #else return false; #endif } int nf_nat_register_fn(struct net *net, u8 pf, const struct nf_hook_ops *ops, const struct nf_hook_ops *nat_ops, unsigned int ops_count); void nf_nat_unregister_fn(struct net *net, u8 pf, const struct nf_hook_ops *ops, unsigned int ops_count); unsigned int nf_nat_packet(struct nf_conn *ct, enum ip_conntrack_info ctinfo, unsigned int hooknum, struct sk_buff *skb); unsigned int nf_nat_manip_pkt(struct sk_buff *skb, struct nf_conn *ct, enum nf_nat_manip_type mtype, enum ip_conntrack_dir dir); void nf_nat_csum_recalc(struct sk_buff *skb, u8 nfproto, u8 proto, void *data, __sum16 *check, int datalen, int oldlen); int nf_nat_icmp_reply_translation(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, unsigned int hooknum); int nf_nat_icmpv6_reply_translation(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, unsigned int hooknum, unsigned int hdrlen); int nf_nat_ipv4_register_fn(struct net *net, const struct nf_hook_ops *ops); void nf_nat_ipv4_unregister_fn(struct net *net, const struct nf_hook_ops *ops); int nf_nat_ipv6_register_fn(struct net *net, const struct nf_hook_ops *ops); void nf_nat_ipv6_unregister_fn(struct net *net, const struct nf_hook_ops *ops); int nf_nat_inet_register_fn(struct net *net, const struct nf_hook_ops *ops); void nf_nat_inet_unregister_fn(struct net *net, const struct nf_hook_ops *ops); unsigned int 
nf_nat_inet_fn(void *priv, struct sk_buff *skb,
               const struct nf_hook_state *state);

static inline int nf_nat_initialized(struct nf_conn *ct,
                                     enum nf_nat_manip_type manip)
{
        if (manip == NF_NAT_MANIP_SRC)
                return ct->status & IPS_SRC_NAT_DONE;
        else
                return ct->status & IPS_DST_NAT_DONE;
}

#endif
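For context, a hedged sketch of how a caller might use the setup API this header declares: attach the NAT extension to a connection and bind a source-NAT range, letting HOOK2MANIP() derive the manip type from the hook number. The function name example_snat_setup(), the fixed address 192.0.2.1, and the NF_DROP fallback are illustrative assumptions, not part of this header.

#include <net/netfilter/nf_nat.h>

static unsigned int example_snat_setup(struct nf_conn *ct,
                                       unsigned int hooknum)
{
        struct nf_nat_range2 range = {};

        /* The NAT extension must be attached before nf_nat_setup_info(). */
        if (!nf_ct_nat_ext_add(ct))
                return NF_DROP;

        range.flags       = NF_NAT_RANGE_MAP_IPS;
        range.min_addr.ip = htonl(0xc0000201);  /* 192.0.2.1 (TEST-NET-1) */
        range.max_addr.ip = range.min_addr.ip;

        /* HOOK2MANIP() selects SRC vs. DST manip from the hook number. */
        return nf_nat_setup_info(ct, &range, HOOK2MANIP(hooknum));
}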
79 63 16 79 79 8 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 // SPDX-License-Identifier: GPL-2.0-only #include <linux/module.h> #include <linux/errno.h> #include <linux/socket.h> #include <linux/udp.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/in6.h> #include <net/udp.h> #include <net/udp_tunnel.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/ip6_tunnel.h> #include <net/ip6_checksum.h> int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg, struct socket **sockp) { struct sockaddr_in6 udp6_addr = {}; int err; struct socket *sock = NULL; err = sock_create_kern(net, AF_INET6, SOCK_DGRAM, 0, &sock); if (err < 0) goto error; if (cfg->ipv6_v6only) { err = ip6_sock_set_v6only(sock->sk); if (err < 0) goto error; } if (cfg->bind_ifindex) { err = sock_bindtoindex(sock->sk, cfg->bind_ifindex, true); if (err < 0) goto error; } udp6_addr.sin6_family = AF_INET6; memcpy(&udp6_addr.sin6_addr, &cfg->local_ip6, sizeof(udp6_addr.sin6_addr)); udp6_addr.sin6_port = cfg->local_udp_port; err = kernel_bind(sock, (struct sockaddr *)&udp6_addr, sizeof(udp6_addr)); if (err < 0) goto error; if (cfg->peer_udp_port) { memset(&udp6_addr, 0, sizeof(udp6_addr)); udp6_addr.sin6_family = AF_INET6; memcpy(&udp6_addr.sin6_addr, &cfg->peer_ip6, sizeof(udp6_addr.sin6_addr)); udp6_addr.sin6_port = cfg->peer_udp_port; err = kernel_connect(sock, (struct sockaddr *)&udp6_addr, sizeof(udp6_addr), 0); } if (err < 0) goto error; udp_set_no_check6_tx(sock->sk, !cfg->use_udp6_tx_checksums); udp_set_no_check6_rx(sock->sk, !cfg->use_udp6_rx_checksums); *sockp = sock; return 0; error: if (sock) { kernel_sock_shutdown(sock, SHUT_RDWR); sock_release(sock); } *sockp = NULL; return err; } EXPORT_SYMBOL_GPL(udp_sock_create6); int udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, struct net_device *dev, struct in6_addr *saddr, struct in6_addr *daddr, __u8 prio, __u8 ttl, __be32 label, __be16 src_port, __be16 dst_port, bool nocheck) { struct udphdr *uh; struct ipv6hdr *ip6h; __skb_push(skb, sizeof(*uh)); skb_reset_transport_header(skb); uh = udp_hdr(skb); uh->dest = dst_port; uh->source = src_port; uh->len = htons(skb->len); skb_dst_set(skb, dst); udp6_set_csum(nocheck, skb, saddr, daddr, skb->len); __skb_push(skb, sizeof(*ip6h)); skb_reset_network_header(skb); ip6h = ipv6_hdr(skb); ip6_flow_hdr(ip6h, prio, label); ip6h->payload_len = htons(skb->len); ip6h->nexthdr = IPPROTO_UDP; ip6h->hop_limit = ttl; ip6h->daddr = *daddr; ip6h->saddr = *saddr; ip6tunnel_xmit(sk, skb, dev); return 0; } EXPORT_SYMBOL_GPL(udp_tunnel6_xmit_skb); MODULE_LICENSE("GPL");
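A short sketch of how a tunnel driver might call udp_sock_create6() above; only udp_port_cfg fields that the function actually reads are filled in, and the port number 4789 plus the example_* name are illustrative assumptions.

#include <linux/err.h>
#include <net/udp_tunnel.h>

static struct socket *example_create_tunnel_sock(struct net *net)
{
        struct udp_port_cfg cfg = {
                /* local_ip6 stays all-zero, i.e. the unspecified address */
                .local_udp_port        = htons(4789),
                .use_udp6_tx_checksums = true,  /* i.e. don't set no_check6_tx */
                .use_udp6_rx_checksums = true,
        };
        struct socket *sock;
        int err;

        err = udp_sock_create6(net, &cfg, &sock);
        if (err)
                return ERR_PTR(err);

        return sock;
}

In-tree users typically then pass the returned socket to setup_udp_tunnel_sock() to install their encapsulation receive handler before transmitting with udp_tunnel6_xmit_skb().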
46 46 46 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 // SPDX-License-Identifier: GPL-2.0-or-later /* rxrpc network namespace handling. * * Copyright (C) 2017 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #include <linux/proc_fs.h> #include "ar-internal.h" unsigned int rxrpc_net_id; static void rxrpc_client_conn_reap_timeout(struct timer_list *timer) { struct rxrpc_net *rxnet = container_of(timer, struct rxrpc_net, client_conn_reap_timer); if (rxnet->live) rxrpc_queue_work(&rxnet->client_conn_reaper); } static void rxrpc_service_conn_reap_timeout(struct timer_list *timer) { struct rxrpc_net *rxnet = container_of(timer, struct rxrpc_net, service_conn_reap_timer); if (rxnet->live) rxrpc_queue_work(&rxnet->service_conn_reaper); } static void rxrpc_peer_keepalive_timeout(struct timer_list *timer) { struct rxrpc_net *rxnet = container_of(timer, struct rxrpc_net, peer_keepalive_timer); if (rxnet->live) rxrpc_queue_work(&rxnet->peer_keepalive_work); } /* * Initialise a per-network namespace record. */ static __net_init int rxrpc_init_net(struct net *net) { struct rxrpc_net *rxnet = rxrpc_net(net); int ret, i; rxnet->live = true; get_random_bytes(&rxnet->epoch, sizeof(rxnet->epoch)); rxnet->epoch |= RXRPC_RANDOM_EPOCH; INIT_LIST_HEAD(&rxnet->calls); spin_lock_init(&rxnet->call_lock); atomic_set(&rxnet->nr_calls, 1); atomic_set(&rxnet->nr_conns, 1); INIT_LIST_HEAD(&rxnet->conn_proc_list); INIT_LIST_HEAD(&rxnet->service_conns); rwlock_init(&rxnet->conn_lock); INIT_WORK(&rxnet->service_conn_reaper, rxrpc_service_connection_reaper); timer_setup(&rxnet->service_conn_reap_timer, rxrpc_service_conn_reap_timeout, 0); atomic_set(&rxnet->nr_client_conns, 0); rxnet->kill_all_client_conns = false; spin_lock_init(&rxnet->client_conn_cache_lock); spin_lock_init(&rxnet->client_conn_discard_lock); INIT_LIST_HEAD(&rxnet->idle_client_conns); INIT_WORK(&rxnet->client_conn_reaper, rxrpc_discard_expired_client_conns); timer_setup(&rxnet->client_conn_reap_timer, rxrpc_client_conn_reap_timeout, 0); INIT_HLIST_HEAD(&rxnet->local_endpoints); mutex_init(&rxnet->local_mutex); hash_init(rxnet->peer_hash); spin_lock_init(&rxnet->peer_hash_lock); for (i = 0; i < ARRAY_SIZE(rxnet->peer_keepalive); i++) INIT_LIST_HEAD(&rxnet->peer_keepalive[i]); INIT_LIST_HEAD(&rxnet->peer_keepalive_new); timer_setup(&rxnet->peer_keepalive_timer, rxrpc_peer_keepalive_timeout, 0); INIT_WORK(&rxnet->peer_keepalive_work, rxrpc_peer_keepalive_worker); rxnet->peer_keepalive_base = ktime_get_seconds(); ret = -ENOMEM; rxnet->proc_net = proc_net_mkdir(net, "rxrpc", net->proc_net); if (!rxnet->proc_net) goto err_proc; proc_create_net("calls", 0444, rxnet->proc_net, &rxrpc_call_seq_ops, sizeof(struct seq_net_private)); proc_create_net("conns", 0444, rxnet->proc_net, &rxrpc_connection_seq_ops, sizeof(struct seq_net_private)); proc_create_net("peers", 0444, rxnet->proc_net, &rxrpc_peer_seq_ops, sizeof(struct seq_net_private)); proc_create_net("locals", 0444, rxnet->proc_net, &rxrpc_local_seq_ops, sizeof(struct seq_net_private)); return 0; err_proc: rxnet->live = false; return ret; } /* * Clean up a per-network namespace record. 
 */
static __net_exit void rxrpc_exit_net(struct net *net)
{
        struct rxrpc_net *rxnet = rxrpc_net(net);

        rxnet->live = false;
        del_timer_sync(&rxnet->peer_keepalive_timer);
        cancel_work_sync(&rxnet->peer_keepalive_work);
        /* Remove the timer again as the worker may have restarted it. */
        del_timer_sync(&rxnet->peer_keepalive_timer);
        rxrpc_destroy_all_calls(rxnet);
        rxrpc_destroy_all_connections(rxnet);
        rxrpc_destroy_all_peers(rxnet);
        rxrpc_destroy_all_locals(rxnet);
        proc_remove(rxnet->proc_net);
}

struct pernet_operations rxrpc_net_ops = {
        .init   = rxrpc_init_net,
        .exit   = rxrpc_exit_net,
        .id     = &rxrpc_net_id,
        .size   = sizeof(struct rxrpc_net),
};
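rxrpc_net_ops above follows the standard pernet_operations pattern: register once with an .id and .size, let .init populate the zeroed per-namespace area, and fetch it later with net_generic(). A self-contained sketch of that pattern (with illustrative example_* names, not rxrpc's real registration code, which lives elsewhere) is:

#include <linux/init.h>
#include <linux/module.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>

struct example_net {
        bool live;
};

static unsigned int example_net_id __read_mostly;

static __net_init int example_init_net(struct net *net)
{
        struct example_net *en = net_generic(net, example_net_id);

        en->live = true;        /* the area is pre-zeroed to .size bytes */
        return 0;
}

static __net_exit void example_exit_net(struct net *net)
{
        struct example_net *en = net_generic(net, example_net_id);

        en->live = false;
}

static struct pernet_operations example_net_ops = {
        .init = example_init_net,
        .exit = example_exit_net,
        .id   = &example_net_id,
        .size = sizeof(struct example_net),
};

static int __init example_module_init(void)
{
        /* Runs example_init_net() for every namespace, including init_net. */
        return register_pernet_subsys(&example_net_ops);
}

static void __exit example_module_exit(void)
{
        unregister_pernet_subsys(&example_net_ops);
}

module_init(example_module_init);
module_exit(example_module_exit);
MODULE_LICENSE("GPL");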
29 29 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 /* * net/tipc/addr.c: TIPC address utility routines * * Copyright (c) 2000-2006, 2018, Ericsson AB * Copyright (c) 2004-2005, 2010-2011, Wind River Systems * Copyright (c) 2020-2021, Red Hat Inc * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include "addr.h" #include "core.h" bool tipc_in_scope(bool legacy_format, u32 domain, u32 addr) { if (!domain || (domain == addr)) return true; if (!legacy_format) return false; if (domain == tipc_cluster_mask(addr)) /* domain <Z.C.0> */ return true; if (domain == (addr & TIPC_ZONE_CLUSTER_MASK)) /* domain <Z.C.0> */ return true; if (domain == (addr & TIPC_ZONE_MASK)) /* domain <Z.0.0> */ return true; return false; } void tipc_set_node_id(struct net *net, u8 *id) { struct tipc_net *tn = tipc_net(net); memcpy(tn->node_id, id, NODE_ID_LEN); tipc_nodeid2string(tn->node_id_string, id); tn->trial_addr = hash128to32(id); pr_info("Node identity %s, cluster identity %u\n", tipc_own_id_string(net), tn->net_id); } void tipc_set_node_addr(struct net *net, u32 addr) { struct tipc_net *tn = tipc_net(net); u8 node_id[NODE_ID_LEN] = {0,}; tn->node_addr = addr; if (!tipc_own_id(net)) { sprintf(node_id, "%x", addr); tipc_set_node_id(net, node_id); } tn->trial_addr = addr; tn->addr_trial_end = jiffies; pr_info("Node number set to %u\n", addr); } char *tipc_nodeid2string(char *str, u8 *id) { int i; u8 c; /* Already a string ? 
 */
        for (i = 0; i < NODE_ID_LEN; i++) {
                c = id[i];
                if (c >= '0' && c <= '9')
                        continue;
                if (c >= 'A' && c <= 'Z')
                        continue;
                if (c >= 'a' && c <= 'z')
                        continue;
                if (c == '.')
                        continue;
                if (c == ':')
                        continue;
                if (c == '_')
                        continue;
                if (c == '-')
                        continue;
                if (c == '@')
                        continue;
                if (c != 0)
                        break;
        }
        if (i == NODE_ID_LEN) {
                memcpy(str, id, NODE_ID_LEN);
                str[NODE_ID_LEN] = 0;
                return str;
        }

        /* Translate to hex string */
        for (i = 0; i < NODE_ID_LEN; i++)
                sprintf(&str[2 * i], "%02x", id[i]);

        /* Strip off trailing zeroes */
        for (i = NODE_ID_STR_LEN - 2; str[i] == '0'; i--)
                str[i] = 0;

        return str;
}
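As a standalone illustration of the hex path in tipc_nodeid2string() above, the following userspace sketch reproduces the translate-and-strip steps; NODE_ID_LEN and NODE_ID_STR_LEN are assumed to match net/tipc/addr.h, and the sample identity is arbitrary.

#include <stdio.h>

#define NODE_ID_LEN     16
#define NODE_ID_STR_LEN (2 * NODE_ID_LEN + 1)

int main(void)
{
        unsigned char id[NODE_ID_LEN] = { 0xca, 0xfe, 0x00, 0x01 };
        char str[NODE_ID_STR_LEN];
        int i;

        /* Not a plain printable identity, so translate to hex ... */
        for (i = 0; i < NODE_ID_LEN; i++)
                sprintf(&str[2 * i], "%02x", id[i]);

        /* ... and strip trailing zeroes, as the kernel helper does. */
        for (i = NODE_ID_STR_LEN - 2; str[i] == '0'; i--)
                str[i] = 0;

        printf("%s\n", str);    /* prints "cafe0001" */
        return 0;
}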
4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 
4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system. INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Routing netlink socket interface: protocol independent part.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 *	Fixes:
 *	Vitaly E. Lavrov		RTA_OK arithmetic was wrong.
 */

#include <linux/bitops.h>
#include <linux/errno.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/kernel.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/fcntl.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/capability.h>
#include <linux/skbuff.h>
#include <linux/init.h>
#include <linux/security.h>
#include <linux/mutex.h>
#include <linux/if_addr.h>
#include <linux/if_bridge.h>
#include <linux/if_vlan.h>
#include <linux/pci.h>
#include <linux/etherdevice.h>
#include <linux/bpf.h>
#include <linux/uaccess.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/arp.h>
#include <net/route.h>
#include <net/udp.h>
#include <net/tcp.h>
#include <net/sock.h>
#include <net/pkt_sched.h>
#include <net/fib_rules.h>
#include <net/rtnetlink.h>
#include <net/net_namespace.h>

#define RTNL_MAX_TYPE		50
#define RTNL_SLAVE_MAX_TYPE	40

struct rtnl_link {
	rtnl_doit_func		doit;
	rtnl_dumpit_func	dumpit;
	struct module		*owner;
	unsigned int		flags;
	struct rcu_head		rcu;
};

static DEFINE_MUTEX(rtnl_mutex);

void rtnl_lock(void)
{
	mutex_lock(&rtnl_mutex);
}
EXPORT_SYMBOL(rtnl_lock);

int rtnl_lock_killable(void)
{
	return mutex_lock_killable(&rtnl_mutex);
}
EXPORT_SYMBOL(rtnl_lock_killable);

static struct sk_buff *defer_kfree_skb_list;
void rtnl_kfree_skbs(struct sk_buff *head, struct sk_buff *tail)
{
	if (head && tail) {
		tail->next = defer_kfree_skb_list;
		defer_kfree_skb_list = head;
	}
}
EXPORT_SYMBOL(rtnl_kfree_skbs);

void __rtnl_unlock(void)
{
	struct sk_buff *head = defer_kfree_skb_list;

	defer_kfree_skb_list = NULL;
	mutex_unlock(&rtnl_mutex);

	while (head) {
		struct sk_buff *next = head->next;

		kfree_skb(head);
		cond_resched();
		head = next;
	}
}

void rtnl_unlock(void)
{
	/* This fellow will unlock it for us.
*/ netdev_run_todo(); } EXPORT_SYMBOL(rtnl_unlock); int rtnl_trylock(void) { return mutex_trylock(&rtnl_mutex); } EXPORT_SYMBOL(rtnl_trylock); int rtnl_is_locked(void) { return mutex_is_locked(&rtnl_mutex); } EXPORT_SYMBOL(rtnl_is_locked); bool refcount_dec_and_rtnl_lock(refcount_t *r) { return refcount_dec_and_mutex_lock(r, &rtnl_mutex); } EXPORT_SYMBOL(refcount_dec_and_rtnl_lock); #ifdef CONFIG_PROVE_LOCKING bool lockdep_rtnl_is_held(void) { return lockdep_is_held(&rtnl_mutex); } EXPORT_SYMBOL(lockdep_rtnl_is_held); #endif /* #ifdef CONFIG_PROVE_LOCKING */ static struct rtnl_link __rcu *__rcu *rtnl_msg_handlers[RTNL_FAMILY_MAX + 1]; static inline int rtm_msgindex(int msgtype) { int msgindex = msgtype - RTM_BASE; /* * msgindex < 0 implies someone tried to register a netlink * control code. msgindex >= RTM_NR_MSGTYPES may indicate that * the message type has not been added to linux/rtnetlink.h */ BUG_ON(msgindex < 0 || msgindex >= RTM_NR_MSGTYPES); return msgindex; } static struct rtnl_link *rtnl_get_link(int protocol, int msgtype) { struct rtnl_link __rcu **tab; if (protocol >= ARRAY_SIZE(rtnl_msg_handlers)) protocol = PF_UNSPEC; tab = rcu_dereference_rtnl(rtnl_msg_handlers[protocol]); if (!tab) tab = rcu_dereference_rtnl(rtnl_msg_handlers[PF_UNSPEC]); return rcu_dereference_rtnl(tab[msgtype]); } static int rtnl_register_internal(struct module *owner, int protocol, int msgtype, rtnl_doit_func doit, rtnl_dumpit_func dumpit, unsigned int flags) { struct rtnl_link *link, *old; struct rtnl_link __rcu **tab; int msgindex; int ret = -ENOBUFS; BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX); msgindex = rtm_msgindex(msgtype); rtnl_lock(); tab = rtnl_dereference(rtnl_msg_handlers[protocol]); if (tab == NULL) { tab = kcalloc(RTM_NR_MSGTYPES, sizeof(void *), GFP_KERNEL); if (!tab) goto unlock; /* ensures we see the 0 stores */ rcu_assign_pointer(rtnl_msg_handlers[protocol], tab); } old = rtnl_dereference(tab[msgindex]); if (old) { link = kmemdup(old, sizeof(*old), GFP_KERNEL); if (!link) goto unlock; } else { link = kzalloc(sizeof(*link), GFP_KERNEL); if (!link) goto unlock; } WARN_ON(link->owner && link->owner != owner); link->owner = owner; WARN_ON(doit && link->doit && link->doit != doit); if (doit) link->doit = doit; WARN_ON(dumpit && link->dumpit && link->dumpit != dumpit); if (dumpit) link->dumpit = dumpit; link->flags |= flags; /* publish protocol:msgtype */ rcu_assign_pointer(tab[msgindex], link); ret = 0; if (old) kfree_rcu(old, rcu); unlock: rtnl_unlock(); return ret; } /** * rtnl_register_module - Register a rtnetlink message type * * @owner: module registering the hook (THIS_MODULE) * @protocol: Protocol family or PF_UNSPEC * @msgtype: rtnetlink message type * @doit: Function pointer called for each request message * @dumpit: Function pointer called for each dump request (NLM_F_DUMP) message * @flags: rtnl_link_flags to modify behaviour of doit/dumpit functions * * Like rtnl_register, but for use by removable modules. 
*/ int rtnl_register_module(struct module *owner, int protocol, int msgtype, rtnl_doit_func doit, rtnl_dumpit_func dumpit, unsigned int flags) { return rtnl_register_internal(owner, protocol, msgtype, doit, dumpit, flags); } EXPORT_SYMBOL_GPL(rtnl_register_module); /** * rtnl_register - Register a rtnetlink message type * @protocol: Protocol family or PF_UNSPEC * @msgtype: rtnetlink message type * @doit: Function pointer called for each request message * @dumpit: Function pointer called for each dump request (NLM_F_DUMP) message * @flags: rtnl_link_flags to modify behaviour of doit/dumpit functions * * Registers the specified function pointers (at least one of them has * to be non-NULL) to be called whenever a request message for the * specified protocol family and message type is received. * * The special protocol family PF_UNSPEC may be used to define fallback * function pointers for the case when no entry for the specific protocol * family exists. */ void rtnl_register(int protocol, int msgtype, rtnl_doit_func doit, rtnl_dumpit_func dumpit, unsigned int flags) { int err; err = rtnl_register_internal(NULL, protocol, msgtype, doit, dumpit, flags); if (err) pr_err("Unable to register rtnetlink message handler, " "protocol = %d, message type = %d\n", protocol, msgtype); } /** * rtnl_unregister - Unregister a rtnetlink message type * @protocol: Protocol family or PF_UNSPEC * @msgtype: rtnetlink message type * * Returns 0 on success or a negative error code. */ int rtnl_unregister(int protocol, int msgtype) { struct rtnl_link __rcu **tab; struct rtnl_link *link; int msgindex; BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX); msgindex = rtm_msgindex(msgtype); rtnl_lock(); tab = rtnl_dereference(rtnl_msg_handlers[protocol]); if (!tab) { rtnl_unlock(); return -ENOENT; } link = rtnl_dereference(tab[msgindex]); rcu_assign_pointer(tab[msgindex], NULL); rtnl_unlock(); kfree_rcu(link, rcu); return 0; } EXPORT_SYMBOL_GPL(rtnl_unregister); /** * rtnl_unregister_all - Unregister all rtnetlink message type of a protocol * @protocol : Protocol family or PF_UNSPEC * * Identical to calling rtnl_unregster() for all registered message types * of a certain protocol family. 
*/ void rtnl_unregister_all(int protocol) { struct rtnl_link __rcu **tab; struct rtnl_link *link; int msgindex; BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX); rtnl_lock(); tab = rtnl_dereference(rtnl_msg_handlers[protocol]); if (!tab) { rtnl_unlock(); return; } RCU_INIT_POINTER(rtnl_msg_handlers[protocol], NULL); for (msgindex = 0; msgindex < RTM_NR_MSGTYPES; msgindex++) { link = rtnl_dereference(tab[msgindex]); if (!link) continue; rcu_assign_pointer(tab[msgindex], NULL); kfree_rcu(link, rcu); } rtnl_unlock(); synchronize_net(); kfree(tab); } EXPORT_SYMBOL_GPL(rtnl_unregister_all); int __rtnl_register_many(const struct rtnl_msg_handler *handlers, int n) { const struct rtnl_msg_handler *handler; int i, err; for (i = 0, handler = handlers; i < n; i++, handler++) { err = rtnl_register_internal(handler->owner, handler->protocol, handler->msgtype, handler->doit, handler->dumpit, handler->flags); if (err) { __rtnl_unregister_many(handlers, i); break; } } return err; } EXPORT_SYMBOL_GPL(__rtnl_register_many); void __rtnl_unregister_many(const struct rtnl_msg_handler *handlers, int n) { const struct rtnl_msg_handler *handler; int i; for (i = n - 1, handler = handlers + n - 1; i >= 0; i--, handler--) rtnl_unregister(handler->protocol, handler->msgtype); } EXPORT_SYMBOL_GPL(__rtnl_unregister_many); static LIST_HEAD(link_ops); static const struct rtnl_link_ops *rtnl_link_ops_get(const char *kind) { const struct rtnl_link_ops *ops; list_for_each_entry(ops, &link_ops, list) { if (!strcmp(ops->kind, kind)) return ops; } return NULL; } /** * __rtnl_link_register - Register rtnl_link_ops with rtnetlink. * @ops: struct rtnl_link_ops * to register * * The caller must hold the rtnl_mutex. This function should be used * by drivers that create devices during module initialization. It * must be called before registering the devices. * * Returns 0 on success or a negative error code. */ int __rtnl_link_register(struct rtnl_link_ops *ops) { if (rtnl_link_ops_get(ops->kind)) return -EEXIST; /* The check for alloc/setup is here because if ops * does not have that filled up, it is not possible * to use the ops for creating device. So do not * fill up dellink as well. That disables rtnl_dellink. */ if ((ops->alloc || ops->setup) && !ops->dellink) ops->dellink = unregister_netdevice_queue; list_add_tail(&ops->list, &link_ops); return 0; } EXPORT_SYMBOL_GPL(__rtnl_link_register); /** * rtnl_link_register - Register rtnl_link_ops with rtnetlink. * @ops: struct rtnl_link_ops * to register * * Returns 0 on success or a negative error code. */ int rtnl_link_register(struct rtnl_link_ops *ops) { int err; /* Sanity-check max sizes to avoid stack buffer overflow. */ if (WARN_ON(ops->maxtype > RTNL_MAX_TYPE || ops->slave_maxtype > RTNL_SLAVE_MAX_TYPE)) return -EINVAL; rtnl_lock(); err = __rtnl_link_register(ops); rtnl_unlock(); return err; } EXPORT_SYMBOL_GPL(rtnl_link_register); static void __rtnl_kill_links(struct net *net, struct rtnl_link_ops *ops) { struct net_device *dev; LIST_HEAD(list_kill); for_each_netdev(net, dev) { if (dev->rtnl_link_ops == ops) ops->dellink(dev, &list_kill); } unregister_netdevice_many(&list_kill); } /** * __rtnl_link_unregister - Unregister rtnl_link_ops from rtnetlink. * @ops: struct rtnl_link_ops * to unregister * * The caller must hold the rtnl_mutex and guarantee net_namespace_list * integrity (hold pernet_ops_rwsem for writing to close the race * with setup_net() and cleanup_net()). 
*/ void __rtnl_link_unregister(struct rtnl_link_ops *ops) { struct net *net; for_each_net(net) { __rtnl_kill_links(net, ops); } list_del(&ops->list); } EXPORT_SYMBOL_GPL(__rtnl_link_unregister); /* Return with the rtnl_lock held when there are no network * devices unregistering in any network namespace. */ static void rtnl_lock_unregistering_all(void) { struct net *net; bool unregistering; DEFINE_WAIT_FUNC(wait, woken_wake_function); add_wait_queue(&netdev_unregistering_wq, &wait); for (;;) { unregistering = false; rtnl_lock(); /* We held write locked pernet_ops_rwsem, and parallel * setup_net() and cleanup_net() are not possible. */ for_each_net(net) { if (net->dev_unreg_count > 0) { unregistering = true; break; } } if (!unregistering) break; __rtnl_unlock(); wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); } remove_wait_queue(&netdev_unregistering_wq, &wait); } /** * rtnl_link_unregister - Unregister rtnl_link_ops from rtnetlink. * @ops: struct rtnl_link_ops * to unregister */ void rtnl_link_unregister(struct rtnl_link_ops *ops) { /* Close the race with setup_net() and cleanup_net() */ down_write(&pernet_ops_rwsem); rtnl_lock_unregistering_all(); __rtnl_link_unregister(ops); rtnl_unlock(); up_write(&pernet_ops_rwsem); } EXPORT_SYMBOL_GPL(rtnl_link_unregister); static size_t rtnl_link_get_slave_info_data_size(const struct net_device *dev) { struct net_device *master_dev; const struct rtnl_link_ops *ops; size_t size = 0; rcu_read_lock(); master_dev = netdev_master_upper_dev_get_rcu((struct net_device *)dev); if (!master_dev) goto out; ops = master_dev->rtnl_link_ops; if (!ops || !ops->get_slave_size) goto out; /* IFLA_INFO_SLAVE_DATA + nested data */ size = nla_total_size(sizeof(struct nlattr)) + ops->get_slave_size(master_dev, dev); out: rcu_read_unlock(); return size; } static size_t rtnl_link_get_size(const struct net_device *dev) { const struct rtnl_link_ops *ops = dev->rtnl_link_ops; size_t size; if (!ops) return 0; size = nla_total_size(sizeof(struct nlattr)) + /* IFLA_LINKINFO */ nla_total_size(strlen(ops->kind) + 1); /* IFLA_INFO_KIND */ if (ops->get_size) /* IFLA_INFO_DATA + nested data */ size += nla_total_size(sizeof(struct nlattr)) + ops->get_size(dev); if (ops->get_xstats_size) /* IFLA_INFO_XSTATS */ size += nla_total_size(ops->get_xstats_size(dev)); size += rtnl_link_get_slave_info_data_size(dev); return size; } static LIST_HEAD(rtnl_af_ops); static const struct rtnl_af_ops *rtnl_af_lookup(const int family) { const struct rtnl_af_ops *ops; ASSERT_RTNL(); list_for_each_entry(ops, &rtnl_af_ops, list) { if (ops->family == family) return ops; } return NULL; } /** * rtnl_af_register - Register rtnl_af_ops with rtnetlink. * @ops: struct rtnl_af_ops * to register * * Returns 0 on success or a negative error code. */ void rtnl_af_register(struct rtnl_af_ops *ops) { rtnl_lock(); list_add_tail_rcu(&ops->list, &rtnl_af_ops); rtnl_unlock(); } EXPORT_SYMBOL_GPL(rtnl_af_register); /** * rtnl_af_unregister - Unregister rtnl_af_ops from rtnetlink. 
* @ops: struct rtnl_af_ops * to unregister */ void rtnl_af_unregister(struct rtnl_af_ops *ops) { rtnl_lock(); list_del_rcu(&ops->list); rtnl_unlock(); synchronize_rcu(); } EXPORT_SYMBOL_GPL(rtnl_af_unregister); static size_t rtnl_link_get_af_size(const struct net_device *dev, u32 ext_filter_mask) { struct rtnl_af_ops *af_ops; size_t size; /* IFLA_AF_SPEC */ size = nla_total_size(sizeof(struct nlattr)); rcu_read_lock(); list_for_each_entry_rcu(af_ops, &rtnl_af_ops, list) { if (af_ops->get_link_af_size) { /* AF_* + nested data */ size += nla_total_size(sizeof(struct nlattr)) + af_ops->get_link_af_size(dev, ext_filter_mask); } } rcu_read_unlock(); return size; } static bool rtnl_have_link_slave_info(const struct net_device *dev) { struct net_device *master_dev; bool ret = false; rcu_read_lock(); master_dev = netdev_master_upper_dev_get_rcu((struct net_device *)dev); if (master_dev && master_dev->rtnl_link_ops) ret = true; rcu_read_unlock(); return ret; } static int rtnl_link_slave_info_fill(struct sk_buff *skb, const struct net_device *dev) { struct net_device *master_dev; const struct rtnl_link_ops *ops; struct nlattr *slave_data; int err; master_dev = netdev_master_upper_dev_get((struct net_device *) dev); if (!master_dev) return 0; ops = master_dev->rtnl_link_ops; if (!ops) return 0; if (nla_put_string(skb, IFLA_INFO_SLAVE_KIND, ops->kind) < 0) return -EMSGSIZE; if (ops->fill_slave_info) { slave_data = nla_nest_start_noflag(skb, IFLA_INFO_SLAVE_DATA); if (!slave_data) return -EMSGSIZE; err = ops->fill_slave_info(skb, master_dev, dev); if (err < 0) goto err_cancel_slave_data; nla_nest_end(skb, slave_data); } return 0; err_cancel_slave_data: nla_nest_cancel(skb, slave_data); return err; } static int rtnl_link_info_fill(struct sk_buff *skb, const struct net_device *dev) { const struct rtnl_link_ops *ops = dev->rtnl_link_ops; struct nlattr *data; int err; if (!ops) return 0; if (nla_put_string(skb, IFLA_INFO_KIND, ops->kind) < 0) return -EMSGSIZE; if (ops->fill_xstats) { err = ops->fill_xstats(skb, dev); if (err < 0) return err; } if (ops->fill_info) { data = nla_nest_start_noflag(skb, IFLA_INFO_DATA); if (data == NULL) return -EMSGSIZE; err = ops->fill_info(skb, dev); if (err < 0) goto err_cancel_data; nla_nest_end(skb, data); } return 0; err_cancel_data: nla_nest_cancel(skb, data); return err; } static int rtnl_link_fill(struct sk_buff *skb, const struct net_device *dev) { struct nlattr *linkinfo; int err = -EMSGSIZE; linkinfo = nla_nest_start_noflag(skb, IFLA_LINKINFO); if (linkinfo == NULL) goto out; err = rtnl_link_info_fill(skb, dev); if (err < 0) goto err_cancel_link; err = rtnl_link_slave_info_fill(skb, dev); if (err < 0) goto err_cancel_link; nla_nest_end(skb, linkinfo); return 0; err_cancel_link: nla_nest_cancel(skb, linkinfo); out: return err; } int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, unsigned int group, int echo) { struct sock *rtnl = net->rtnl; return nlmsg_notify(rtnl, skb, pid, group, echo, GFP_KERNEL); } int rtnl_unicast(struct sk_buff *skb, struct net *net, u32 pid) { struct sock *rtnl = net->rtnl; return nlmsg_unicast(rtnl, skb, pid); } EXPORT_SYMBOL(rtnl_unicast); void rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, u32 group, struct nlmsghdr *nlh, gfp_t flags) { struct sock *rtnl = net->rtnl; nlmsg_notify(rtnl, skb, pid, group, nlmsg_report(nlh), flags); } EXPORT_SYMBOL(rtnl_notify); void rtnl_set_sk_err(struct net *net, u32 group, int error) { struct sock *rtnl = net->rtnl; netlink_set_err(rtnl, 0, group, error); } 
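/*
 * Illustrative sketch, not part of the original file: the fill helpers
 * above (rtnl_link_info_fill(), rtnl_link_slave_info_fill() and
 * rtnl_link_fill()) all follow the same netlink nesting pattern -- open a
 * nested attribute, emit its payload, then either commit the nest or cancel
 * it so a partially written attribute never reaches user space. The
 * attribute ids EXAMPLE_NEST and EXAMPLE_VALUE below are hypothetical and
 * exist only for this example.
 */
#if 0	/* example only, never compiled */
static int example_fill_nested(struct sk_buff *skb, u32 value)
{
	struct nlattr *nest;

	nest = nla_nest_start_noflag(skb, EXAMPLE_NEST);
	if (!nest)
		return -EMSGSIZE;	/* no tailroom left in the skb */

	if (nla_put_u32(skb, EXAMPLE_VALUE, value)) {
		/* roll the half-written nest back before reporting failure */
		nla_nest_cancel(skb, nest);
		return -EMSGSIZE;
	}

	/* patch the nest header with its final length */
	nla_nest_end(skb, nest);
	return 0;
}
#endif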
EXPORT_SYMBOL(rtnl_set_sk_err); int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics) { struct nlattr *mx; int i, valid = 0; /* nothing is dumped for dst_default_metrics, so just skip the loop */ if (metrics == dst_default_metrics.metrics) return 0; mx = nla_nest_start_noflag(skb, RTA_METRICS); if (mx == NULL) return -ENOBUFS; for (i = 0; i < RTAX_MAX; i++) { if (metrics[i]) { if (i == RTAX_CC_ALGO - 1) { char tmp[TCP_CA_NAME_MAX], *name; name = tcp_ca_get_name_by_key(metrics[i], tmp); if (!name) continue; if (nla_put_string(skb, i + 1, name)) goto nla_put_failure; } else if (i == RTAX_FEATURES - 1) { u32 user_features = metrics[i] & RTAX_FEATURE_MASK; if (!user_features) continue; BUILD_BUG_ON(RTAX_FEATURE_MASK & DST_FEATURE_MASK); if (nla_put_u32(skb, i + 1, user_features)) goto nla_put_failure; } else { if (nla_put_u32(skb, i + 1, metrics[i])) goto nla_put_failure; } valid++; } } if (!valid) { nla_nest_cancel(skb, mx); return 0; } return nla_nest_end(skb, mx); nla_put_failure: nla_nest_cancel(skb, mx); return -EMSGSIZE; } EXPORT_SYMBOL(rtnetlink_put_metrics); int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id, long expires, u32 error) { struct rta_cacheinfo ci = { .rta_error = error, .rta_id = id, }; if (dst) { ci.rta_lastuse = jiffies_delta_to_clock_t(jiffies - dst->lastuse); ci.rta_used = dst->__use; ci.rta_clntref = atomic_read(&dst->__refcnt); } if (expires) { unsigned long clock; clock = jiffies_to_clock_t(abs(expires)); clock = min_t(unsigned long, clock, INT_MAX); ci.rta_expires = (expires > 0) ? clock : -clock; } return nla_put(skb, RTA_CACHEINFO, sizeof(ci), &ci); } EXPORT_SYMBOL_GPL(rtnl_put_cacheinfo); static void set_operstate(struct net_device *dev, unsigned char transition) { unsigned char operstate = dev->operstate; switch (transition) { case IF_OPER_UP: if ((operstate == IF_OPER_DORMANT || operstate == IF_OPER_TESTING || operstate == IF_OPER_UNKNOWN) && !netif_dormant(dev) && !netif_testing(dev)) operstate = IF_OPER_UP; break; case IF_OPER_TESTING: if (operstate == IF_OPER_UP || operstate == IF_OPER_UNKNOWN) operstate = IF_OPER_TESTING; break; case IF_OPER_DORMANT: if (operstate == IF_OPER_UP || operstate == IF_OPER_UNKNOWN) operstate = IF_OPER_DORMANT; break; } if (dev->operstate != operstate) { write_lock(&dev_base_lock); dev->operstate = operstate; write_unlock(&dev_base_lock); netdev_state_change(dev); } } static unsigned int rtnl_dev_get_flags(const struct net_device *dev) { return (dev->flags & ~(IFF_PROMISC | IFF_ALLMULTI)) | (dev->gflags & (IFF_PROMISC | IFF_ALLMULTI)); } static unsigned int rtnl_dev_combine_flags(const struct net_device *dev, const struct ifinfomsg *ifm) { unsigned int flags = ifm->ifi_flags; /* bugwards compatibility: ifi_change == 0 is treated as ~0 */ if (ifm->ifi_change) flags = (flags & ifm->ifi_change) | (rtnl_dev_get_flags(dev) & ~ifm->ifi_change); return flags; } static void copy_rtnl_link_stats(struct rtnl_link_stats *a, const struct rtnl_link_stats64 *b) { a->rx_packets = b->rx_packets; a->tx_packets = b->tx_packets; a->rx_bytes = b->rx_bytes; a->tx_bytes = b->tx_bytes; a->rx_errors = b->rx_errors; a->tx_errors = b->tx_errors; a->rx_dropped = b->rx_dropped; a->tx_dropped = b->tx_dropped; a->multicast = b->multicast; a->collisions = b->collisions; a->rx_length_errors = b->rx_length_errors; a->rx_over_errors = b->rx_over_errors; a->rx_crc_errors = b->rx_crc_errors; a->rx_frame_errors = b->rx_frame_errors; a->rx_fifo_errors = b->rx_fifo_errors; a->rx_missed_errors = b->rx_missed_errors; a->tx_aborted_errors = 
b->tx_aborted_errors; a->tx_carrier_errors = b->tx_carrier_errors; a->tx_fifo_errors = b->tx_fifo_errors; a->tx_heartbeat_errors = b->tx_heartbeat_errors; a->tx_window_errors = b->tx_window_errors; a->rx_compressed = b->rx_compressed; a->tx_compressed = b->tx_compressed; a->rx_nohandler = b->rx_nohandler; } /* All VF info */ static inline int rtnl_vfinfo_size(const struct net_device *dev, u32 ext_filter_mask) { if (dev->dev.parent && (ext_filter_mask & RTEXT_FILTER_VF)) { int num_vfs = dev_num_vf(dev->dev.parent); size_t size = nla_total_size(0); size += num_vfs * (nla_total_size(0) + nla_total_size(sizeof(struct ifla_vf_mac)) + nla_total_size(sizeof(struct ifla_vf_broadcast)) + nla_total_size(sizeof(struct ifla_vf_vlan)) + nla_total_size(0) + /* nest IFLA_VF_VLAN_LIST */ nla_total_size(MAX_VLAN_LIST_LEN * sizeof(struct ifla_vf_vlan_info)) + nla_total_size(sizeof(struct ifla_vf_spoofchk)) + nla_total_size(sizeof(struct ifla_vf_tx_rate)) + nla_total_size(sizeof(struct ifla_vf_rate)) + nla_total_size(sizeof(struct ifla_vf_link_state)) + nla_total_size(sizeof(struct ifla_vf_rss_query_en)) + nla_total_size(sizeof(struct ifla_vf_trust))); if (~ext_filter_mask & RTEXT_FILTER_SKIP_STATS) { size += num_vfs * (nla_total_size(0) + /* nest IFLA_VF_STATS */ /* IFLA_VF_STATS_RX_PACKETS */ nla_total_size_64bit(sizeof(__u64)) + /* IFLA_VF_STATS_TX_PACKETS */ nla_total_size_64bit(sizeof(__u64)) + /* IFLA_VF_STATS_RX_BYTES */ nla_total_size_64bit(sizeof(__u64)) + /* IFLA_VF_STATS_TX_BYTES */ nla_total_size_64bit(sizeof(__u64)) + /* IFLA_VF_STATS_BROADCAST */ nla_total_size_64bit(sizeof(__u64)) + /* IFLA_VF_STATS_MULTICAST */ nla_total_size_64bit(sizeof(__u64)) + /* IFLA_VF_STATS_RX_DROPPED */ nla_total_size_64bit(sizeof(__u64)) + /* IFLA_VF_STATS_TX_DROPPED */ nla_total_size_64bit(sizeof(__u64))); } return size; } else return 0; } static size_t rtnl_port_size(const struct net_device *dev, u32 ext_filter_mask) { size_t port_size = nla_total_size(4) /* PORT_VF */ + nla_total_size(PORT_PROFILE_MAX) /* PORT_PROFILE */ + nla_total_size(PORT_UUID_MAX) /* PORT_INSTANCE_UUID */ + nla_total_size(PORT_UUID_MAX) /* PORT_HOST_UUID */ + nla_total_size(1) /* PROT_VDP_REQUEST */ + nla_total_size(2); /* PORT_VDP_RESPONSE */ size_t vf_ports_size = nla_total_size(sizeof(struct nlattr)); size_t vf_port_size = nla_total_size(sizeof(struct nlattr)) + port_size; size_t port_self_size = nla_total_size(sizeof(struct nlattr)) + port_size; if (!dev->netdev_ops->ndo_get_vf_port || !dev->dev.parent || !(ext_filter_mask & RTEXT_FILTER_VF)) return 0; if (dev_num_vf(dev->dev.parent)) return port_self_size + vf_ports_size + vf_port_size * dev_num_vf(dev->dev.parent); else return port_self_size; } static size_t rtnl_xdp_size(void) { size_t xdp_size = nla_total_size(0) + /* nest IFLA_XDP */ nla_total_size(1) + /* XDP_ATTACHED */ nla_total_size(4) + /* XDP_PROG_ID (or 1st mode) */ nla_total_size(4); /* XDP_<mode>_PROG_ID */ return xdp_size; } static size_t rtnl_prop_list_size(const struct net_device *dev) { struct netdev_name_node *name_node; size_t size; if (list_empty(&dev->name_node->list)) return 0; size = nla_total_size(0); list_for_each_entry(name_node, &dev->name_node->list, list) size += nla_total_size(ALTIFNAMSIZ); return size; } static size_t rtnl_proto_down_size(const struct net_device *dev) { size_t size = nla_total_size(1); if (dev->proto_down_reason) size += nla_total_size(0) + nla_total_size(4); return size; } static noinline size_t if_nlmsg_size(const struct net_device *dev, u32 ext_filter_mask) { return 
NLMSG_ALIGN(sizeof(struct ifinfomsg)) + nla_total_size(IFNAMSIZ) /* IFLA_IFNAME */ + nla_total_size(IFALIASZ) /* IFLA_IFALIAS */ + nla_total_size(IFNAMSIZ) /* IFLA_QDISC */ + nla_total_size_64bit(sizeof(struct rtnl_link_ifmap)) + nla_total_size(sizeof(struct rtnl_link_stats)) + nla_total_size_64bit(sizeof(struct rtnl_link_stats64)) + nla_total_size(MAX_ADDR_LEN) /* IFLA_ADDRESS */ + nla_total_size(MAX_ADDR_LEN) /* IFLA_BROADCAST */ + nla_total_size(4) /* IFLA_TXQLEN */ + nla_total_size(4) /* IFLA_WEIGHT */ + nla_total_size(4) /* IFLA_MTU */ + nla_total_size(4) /* IFLA_LINK */ + nla_total_size(4) /* IFLA_MASTER */ + nla_total_size(1) /* IFLA_CARRIER */ + nla_total_size(4) /* IFLA_PROMISCUITY */ + nla_total_size(4) /* IFLA_NUM_TX_QUEUES */ + nla_total_size(4) /* IFLA_NUM_RX_QUEUES */ + nla_total_size(4) /* IFLA_GSO_MAX_SEGS */ + nla_total_size(4) /* IFLA_GSO_MAX_SIZE */ + nla_total_size(1) /* IFLA_OPERSTATE */ + nla_total_size(1) /* IFLA_LINKMODE */ + nla_total_size(4) /* IFLA_CARRIER_CHANGES */ + nla_total_size(4) /* IFLA_LINK_NETNSID */ + nla_total_size(4) /* IFLA_GROUP */ + nla_total_size(ext_filter_mask & RTEXT_FILTER_VF ? 4 : 0) /* IFLA_NUM_VF */ + rtnl_vfinfo_size(dev, ext_filter_mask) /* IFLA_VFINFO_LIST */ + rtnl_port_size(dev, ext_filter_mask) /* IFLA_VF_PORTS + IFLA_PORT_SELF */ + rtnl_link_get_size(dev) /* IFLA_LINKINFO */ + rtnl_link_get_af_size(dev, ext_filter_mask) /* IFLA_AF_SPEC */ + nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_PORT_ID */ + nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_SWITCH_ID */ + nla_total_size(IFNAMSIZ) /* IFLA_PHYS_PORT_NAME */ + rtnl_xdp_size() /* IFLA_XDP */ + nla_total_size(4) /* IFLA_EVENT */ + nla_total_size(4) /* IFLA_NEW_NETNSID */ + nla_total_size(4) /* IFLA_NEW_IFINDEX */ + rtnl_proto_down_size(dev) /* proto down */ + nla_total_size(4) /* IFLA_TARGET_NETNSID */ + nla_total_size(4) /* IFLA_CARRIER_UP_COUNT */ + nla_total_size(4) /* IFLA_CARRIER_DOWN_COUNT */ + nla_total_size(4) /* IFLA_MIN_MTU */ + nla_total_size(4) /* IFLA_MAX_MTU */ + rtnl_prop_list_size(dev) + nla_total_size(MAX_ADDR_LEN) /* IFLA_PERM_ADDRESS */ + 0; } static int rtnl_vf_ports_fill(struct sk_buff *skb, struct net_device *dev) { struct nlattr *vf_ports; struct nlattr *vf_port; int vf; int err; vf_ports = nla_nest_start_noflag(skb, IFLA_VF_PORTS); if (!vf_ports) return -EMSGSIZE; for (vf = 0; vf < dev_num_vf(dev->dev.parent); vf++) { vf_port = nla_nest_start_noflag(skb, IFLA_VF_PORT); if (!vf_port) goto nla_put_failure; if (nla_put_u32(skb, IFLA_PORT_VF, vf)) goto nla_put_failure; err = dev->netdev_ops->ndo_get_vf_port(dev, vf, skb); if (err == -EMSGSIZE) goto nla_put_failure; if (err) { nla_nest_cancel(skb, vf_port); continue; } nla_nest_end(skb, vf_port); } nla_nest_end(skb, vf_ports); return 0; nla_put_failure: nla_nest_cancel(skb, vf_ports); return -EMSGSIZE; } static int rtnl_port_self_fill(struct sk_buff *skb, struct net_device *dev) { struct nlattr *port_self; int err; port_self = nla_nest_start_noflag(skb, IFLA_PORT_SELF); if (!port_self) return -EMSGSIZE; err = dev->netdev_ops->ndo_get_vf_port(dev, PORT_SELF_VF, skb); if (err) { nla_nest_cancel(skb, port_self); return (err == -EMSGSIZE) ? 
err : 0; } nla_nest_end(skb, port_self); return 0; } static int rtnl_port_fill(struct sk_buff *skb, struct net_device *dev, u32 ext_filter_mask) { int err; if (!dev->netdev_ops->ndo_get_vf_port || !dev->dev.parent || !(ext_filter_mask & RTEXT_FILTER_VF)) return 0; err = rtnl_port_self_fill(skb, dev); if (err) return err; if (dev_num_vf(dev->dev.parent)) { err = rtnl_vf_ports_fill(skb, dev); if (err) return err; } return 0; } static int rtnl_phys_port_id_fill(struct sk_buff *skb, struct net_device *dev) { int err; struct netdev_phys_item_id ppid; err = dev_get_phys_port_id(dev, &ppid); if (err) { if (err == -EOPNOTSUPP) return 0; return err; } if (nla_put(skb, IFLA_PHYS_PORT_ID, ppid.id_len, ppid.id)) return -EMSGSIZE; return 0; } static int rtnl_phys_port_name_fill(struct sk_buff *skb, struct net_device *dev) { char name[IFNAMSIZ]; int err; err = dev_get_phys_port_name(dev, name, sizeof(name)); if (err) { if (err == -EOPNOTSUPP) return 0; return err; } if (nla_put_string(skb, IFLA_PHYS_PORT_NAME, name)) return -EMSGSIZE; return 0; } static int rtnl_phys_switch_id_fill(struct sk_buff *skb, struct net_device *dev) { struct netdev_phys_item_id ppid = { }; int err; err = dev_get_port_parent_id(dev, &ppid, false); if (err) { if (err == -EOPNOTSUPP) return 0; return err; } if (nla_put(skb, IFLA_PHYS_SWITCH_ID, ppid.id_len, ppid.id)) return -EMSGSIZE; return 0; } static noinline_for_stack int rtnl_fill_stats(struct sk_buff *skb, struct net_device *dev) { struct rtnl_link_stats64 *sp; struct nlattr *attr; attr = nla_reserve_64bit(skb, IFLA_STATS64, sizeof(struct rtnl_link_stats64), IFLA_PAD); if (!attr) return -EMSGSIZE; sp = nla_data(attr); dev_get_stats(dev, sp); attr = nla_reserve(skb, IFLA_STATS, sizeof(struct rtnl_link_stats)); if (!attr) return -EMSGSIZE; copy_rtnl_link_stats(nla_data(attr), sp); return 0; } static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, struct net_device *dev, int vfs_num, struct nlattr *vfinfo, u32 ext_filter_mask) { struct ifla_vf_rss_query_en vf_rss_query_en; struct nlattr *vf, *vfstats, *vfvlanlist; struct ifla_vf_link_state vf_linkstate; struct ifla_vf_vlan_info vf_vlan_info; struct ifla_vf_spoofchk vf_spoofchk; struct ifla_vf_tx_rate vf_tx_rate; struct ifla_vf_stats vf_stats; struct ifla_vf_trust vf_trust; struct ifla_vf_vlan vf_vlan; struct ifla_vf_rate vf_rate; struct ifla_vf_mac vf_mac; struct ifla_vf_broadcast vf_broadcast; struct ifla_vf_info ivi; struct ifla_vf_guid node_guid; struct ifla_vf_guid port_guid; memset(&ivi, 0, sizeof(ivi)); /* Not all SR-IOV capable drivers support the * spoofcheck and "RSS query enable" query. Preset to * -1 so the user space tool can detect that the driver * didn't report anything. 
*/ ivi.spoofchk = -1; ivi.rss_query_en = -1; ivi.trusted = -1; /* The default value for VF link state is "auto" * IFLA_VF_LINK_STATE_AUTO which equals zero */ ivi.linkstate = 0; /* VLAN Protocol by default is 802.1Q */ ivi.vlan_proto = htons(ETH_P_8021Q); if (dev->netdev_ops->ndo_get_vf_config(dev, vfs_num, &ivi)) return 0; memset(&vf_vlan_info, 0, sizeof(vf_vlan_info)); memset(&node_guid, 0, sizeof(node_guid)); memset(&port_guid, 0, sizeof(port_guid)); vf_mac.vf = vf_vlan.vf = vf_vlan_info.vf = vf_rate.vf = vf_tx_rate.vf = vf_spoofchk.vf = vf_linkstate.vf = vf_rss_query_en.vf = vf_trust.vf = node_guid.vf = port_guid.vf = ivi.vf; memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac)); memcpy(vf_broadcast.broadcast, dev->broadcast, dev->addr_len); vf_vlan.vlan = ivi.vlan; vf_vlan.qos = ivi.qos; vf_vlan_info.vlan = ivi.vlan; vf_vlan_info.qos = ivi.qos; vf_vlan_info.vlan_proto = ivi.vlan_proto; vf_tx_rate.rate = ivi.max_tx_rate; vf_rate.min_tx_rate = ivi.min_tx_rate; vf_rate.max_tx_rate = ivi.max_tx_rate; vf_spoofchk.setting = ivi.spoofchk; vf_linkstate.link_state = ivi.linkstate; vf_rss_query_en.setting = ivi.rss_query_en; vf_trust.setting = ivi.trusted; vf = nla_nest_start_noflag(skb, IFLA_VF_INFO); if (!vf) goto nla_put_vfinfo_failure; if (nla_put(skb, IFLA_VF_MAC, sizeof(vf_mac), &vf_mac) || nla_put(skb, IFLA_VF_BROADCAST, sizeof(vf_broadcast), &vf_broadcast) || nla_put(skb, IFLA_VF_VLAN, sizeof(vf_vlan), &vf_vlan) || nla_put(skb, IFLA_VF_RATE, sizeof(vf_rate), &vf_rate) || nla_put(skb, IFLA_VF_TX_RATE, sizeof(vf_tx_rate), &vf_tx_rate) || nla_put(skb, IFLA_VF_SPOOFCHK, sizeof(vf_spoofchk), &vf_spoofchk) || nla_put(skb, IFLA_VF_LINK_STATE, sizeof(vf_linkstate), &vf_linkstate) || nla_put(skb, IFLA_VF_RSS_QUERY_EN, sizeof(vf_rss_query_en), &vf_rss_query_en) || nla_put(skb, IFLA_VF_TRUST, sizeof(vf_trust), &vf_trust)) goto nla_put_vf_failure; if (dev->netdev_ops->ndo_get_vf_guid && !dev->netdev_ops->ndo_get_vf_guid(dev, vfs_num, &node_guid, &port_guid)) { if (nla_put(skb, IFLA_VF_IB_NODE_GUID, sizeof(node_guid), &node_guid) || nla_put(skb, IFLA_VF_IB_PORT_GUID, sizeof(port_guid), &port_guid)) goto nla_put_vf_failure; } vfvlanlist = nla_nest_start_noflag(skb, IFLA_VF_VLAN_LIST); if (!vfvlanlist) goto nla_put_vf_failure; if (nla_put(skb, IFLA_VF_VLAN_INFO, sizeof(vf_vlan_info), &vf_vlan_info)) { nla_nest_cancel(skb, vfvlanlist); goto nla_put_vf_failure; } nla_nest_end(skb, vfvlanlist); if (~ext_filter_mask & RTEXT_FILTER_SKIP_STATS) { memset(&vf_stats, 0, sizeof(vf_stats)); if (dev->netdev_ops->ndo_get_vf_stats) dev->netdev_ops->ndo_get_vf_stats(dev, vfs_num, &vf_stats); vfstats = nla_nest_start_noflag(skb, IFLA_VF_STATS); if (!vfstats) goto nla_put_vf_failure; if (nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_PACKETS, vf_stats.rx_packets, IFLA_VF_STATS_PAD) || nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_PACKETS, vf_stats.tx_packets, IFLA_VF_STATS_PAD) || nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_BYTES, vf_stats.rx_bytes, IFLA_VF_STATS_PAD) || nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_BYTES, vf_stats.tx_bytes, IFLA_VF_STATS_PAD) || nla_put_u64_64bit(skb, IFLA_VF_STATS_BROADCAST, vf_stats.broadcast, IFLA_VF_STATS_PAD) || nla_put_u64_64bit(skb, IFLA_VF_STATS_MULTICAST, vf_stats.multicast, IFLA_VF_STATS_PAD) || nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_DROPPED, vf_stats.rx_dropped, IFLA_VF_STATS_PAD) || nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_DROPPED, vf_stats.tx_dropped, IFLA_VF_STATS_PAD)) { nla_nest_cancel(skb, vfstats); goto nla_put_vf_failure; } nla_nest_end(skb, vfstats); } nla_nest_end(skb, vf); return 0; 
nla_put_vf_failure: nla_nest_cancel(skb, vf); nla_put_vfinfo_failure: nla_nest_cancel(skb, vfinfo); return -EMSGSIZE; } static noinline_for_stack int rtnl_fill_vf(struct sk_buff *skb, struct net_device *dev, u32 ext_filter_mask) { struct nlattr *vfinfo; int i, num_vfs; if (!dev->dev.parent || ((ext_filter_mask & RTEXT_FILTER_VF) == 0)) return 0; num_vfs = dev_num_vf(dev->dev.parent); if (nla_put_u32(skb, IFLA_NUM_VF, num_vfs)) return -EMSGSIZE; if (!dev->netdev_ops->ndo_get_vf_config) return 0; vfinfo = nla_nest_start_noflag(skb, IFLA_VFINFO_LIST); if (!vfinfo) return -EMSGSIZE; for (i = 0; i < num_vfs; i++) { if (rtnl_fill_vfinfo(skb, dev, i, vfinfo, ext_filter_mask)) return -EMSGSIZE; } nla_nest_end(skb, vfinfo); return 0; } static int rtnl_fill_link_ifmap(struct sk_buff *skb, struct net_device *dev) { struct rtnl_link_ifmap map; memset(&map, 0, sizeof(map)); map.mem_start = dev->mem_start; map.mem_end = dev->mem_end; map.base_addr = dev->base_addr; map.irq = dev->irq; map.dma = dev->dma; map.port = dev->if_port; if (nla_put_64bit(skb, IFLA_MAP, sizeof(map), &map, IFLA_PAD)) return -EMSGSIZE; return 0; } static u32 rtnl_xdp_prog_skb(struct net_device *dev) { const struct bpf_prog *generic_xdp_prog; ASSERT_RTNL(); generic_xdp_prog = rtnl_dereference(dev->xdp_prog); if (!generic_xdp_prog) return 0; return generic_xdp_prog->aux->id; } static u32 rtnl_xdp_prog_drv(struct net_device *dev) { return dev_xdp_prog_id(dev, XDP_MODE_DRV); } static u32 rtnl_xdp_prog_hw(struct net_device *dev) { return dev_xdp_prog_id(dev, XDP_MODE_HW); } static int rtnl_xdp_report_one(struct sk_buff *skb, struct net_device *dev, u32 *prog_id, u8 *mode, u8 tgt_mode, u32 attr, u32 (*get_prog_id)(struct net_device *dev)) { u32 curr_id; int err; curr_id = get_prog_id(dev); if (!curr_id) return 0; *prog_id = curr_id; err = nla_put_u32(skb, attr, curr_id); if (err) return err; if (*mode != XDP_ATTACHED_NONE) *mode = XDP_ATTACHED_MULTI; else *mode = tgt_mode; return 0; } static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev) { struct nlattr *xdp; u32 prog_id; int err; u8 mode; xdp = nla_nest_start_noflag(skb, IFLA_XDP); if (!xdp) return -EMSGSIZE; prog_id = 0; mode = XDP_ATTACHED_NONE; err = rtnl_xdp_report_one(skb, dev, &prog_id, &mode, XDP_ATTACHED_SKB, IFLA_XDP_SKB_PROG_ID, rtnl_xdp_prog_skb); if (err) goto err_cancel; err = rtnl_xdp_report_one(skb, dev, &prog_id, &mode, XDP_ATTACHED_DRV, IFLA_XDP_DRV_PROG_ID, rtnl_xdp_prog_drv); if (err) goto err_cancel; err = rtnl_xdp_report_one(skb, dev, &prog_id, &mode, XDP_ATTACHED_HW, IFLA_XDP_HW_PROG_ID, rtnl_xdp_prog_hw); if (err) goto err_cancel; err = nla_put_u8(skb, IFLA_XDP_ATTACHED, mode); if (err) goto err_cancel; if (prog_id && mode != XDP_ATTACHED_MULTI) { err = nla_put_u32(skb, IFLA_XDP_PROG_ID, prog_id); if (err) goto err_cancel; } nla_nest_end(skb, xdp); return 0; err_cancel: nla_nest_cancel(skb, xdp); return err; } static u32 rtnl_get_event(unsigned long event) { u32 rtnl_event_type = IFLA_EVENT_NONE; switch (event) { case NETDEV_REBOOT: rtnl_event_type = IFLA_EVENT_REBOOT; break; case NETDEV_FEAT_CHANGE: rtnl_event_type = IFLA_EVENT_FEATURES; break; case NETDEV_BONDING_FAILOVER: rtnl_event_type = IFLA_EVENT_BONDING_FAILOVER; break; case NETDEV_NOTIFY_PEERS: rtnl_event_type = IFLA_EVENT_NOTIFY_PEERS; break; case NETDEV_RESEND_IGMP: rtnl_event_type = IFLA_EVENT_IGMP_RESEND; break; case NETDEV_CHANGEINFODATA: rtnl_event_type = IFLA_EVENT_BONDING_OPTIONS; break; default: break; } return rtnl_event_type; } static int put_master_ifindex(struct sk_buff *skb, 
struct net_device *dev) { const struct net_device *upper_dev; int ret = 0; rcu_read_lock(); upper_dev = netdev_master_upper_dev_get_rcu(dev); if (upper_dev) ret = nla_put_u32(skb, IFLA_MASTER, upper_dev->ifindex); rcu_read_unlock(); return ret; } static int nla_put_iflink(struct sk_buff *skb, const struct net_device *dev, bool force) { int ifindex = dev_get_iflink(dev); if (force || dev->ifindex != ifindex) return nla_put_u32(skb, IFLA_LINK, ifindex); return 0; } static noinline_for_stack int nla_put_ifalias(struct sk_buff *skb, struct net_device *dev) { char buf[IFALIASZ]; int ret; ret = dev_get_alias(dev, buf, sizeof(buf)); return ret > 0 ? nla_put_string(skb, IFLA_IFALIAS, buf) : 0; } static int rtnl_fill_link_netnsid(struct sk_buff *skb, const struct net_device *dev, struct net *src_net, gfp_t gfp) { bool put_iflink = false; if (dev->rtnl_link_ops && dev->rtnl_link_ops->get_link_net) { struct net *link_net = dev->rtnl_link_ops->get_link_net(dev); if (!net_eq(dev_net(dev), link_net)) { int id = peernet2id_alloc(src_net, link_net, gfp); if (nla_put_s32(skb, IFLA_LINK_NETNSID, id)) return -EMSGSIZE; put_iflink = true; } } return nla_put_iflink(skb, dev, put_iflink); } static int rtnl_fill_link_af(struct sk_buff *skb, const struct net_device *dev, u32 ext_filter_mask) { const struct rtnl_af_ops *af_ops; struct nlattr *af_spec; af_spec = nla_nest_start_noflag(skb, IFLA_AF_SPEC); if (!af_spec) return -EMSGSIZE; list_for_each_entry_rcu(af_ops, &rtnl_af_ops, list) { struct nlattr *af; int err; if (!af_ops->fill_link_af) continue; af = nla_nest_start_noflag(skb, af_ops->family); if (!af) return -EMSGSIZE; err = af_ops->fill_link_af(skb, dev, ext_filter_mask); /* * Caller may return ENODATA to indicate that there * was no data to be dumped. This is not an error, it * means we should trim the attribute header and * continue. 
*/ if (err == -ENODATA) nla_nest_cancel(skb, af); else if (err < 0) return -EMSGSIZE; nla_nest_end(skb, af); } nla_nest_end(skb, af_spec); return 0; } static int rtnl_fill_alt_ifnames(struct sk_buff *skb, const struct net_device *dev) { struct netdev_name_node *name_node; int count = 0; list_for_each_entry(name_node, &dev->name_node->list, list) { if (nla_put_string(skb, IFLA_ALT_IFNAME, name_node->name)) return -EMSGSIZE; count++; } return count; } static int rtnl_fill_prop_list(struct sk_buff *skb, const struct net_device *dev) { struct nlattr *prop_list; int ret; prop_list = nla_nest_start(skb, IFLA_PROP_LIST); if (!prop_list) return -EMSGSIZE; ret = rtnl_fill_alt_ifnames(skb, dev); if (ret <= 0) goto nest_cancel; nla_nest_end(skb, prop_list); return 0; nest_cancel: nla_nest_cancel(skb, prop_list); return ret; } static int rtnl_fill_proto_down(struct sk_buff *skb, const struct net_device *dev) { struct nlattr *pr; u32 preason; if (nla_put_u8(skb, IFLA_PROTO_DOWN, dev->proto_down)) goto nla_put_failure; preason = dev->proto_down_reason; if (!preason) return 0; pr = nla_nest_start(skb, IFLA_PROTO_DOWN_REASON); if (!pr) return -EMSGSIZE; if (nla_put_u32(skb, IFLA_PROTO_DOWN_REASON_VALUE, preason)) { nla_nest_cancel(skb, pr); goto nla_put_failure; } nla_nest_end(skb, pr); return 0; nla_put_failure: return -EMSGSIZE; } static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, struct net *src_net, int type, u32 pid, u32 seq, u32 change, unsigned int flags, u32 ext_filter_mask, u32 event, int *new_nsid, int new_ifindex, int tgt_netnsid, gfp_t gfp) { struct ifinfomsg *ifm; struct nlmsghdr *nlh; struct Qdisc *qdisc; ASSERT_RTNL(); nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ifm), flags); if (nlh == NULL) return -EMSGSIZE; ifm = nlmsg_data(nlh); ifm->ifi_family = AF_UNSPEC; ifm->__ifi_pad = 0; ifm->ifi_type = dev->type; ifm->ifi_index = dev->ifindex; ifm->ifi_flags = dev_get_flags(dev); ifm->ifi_change = change; if (tgt_netnsid >= 0 && nla_put_s32(skb, IFLA_TARGET_NETNSID, tgt_netnsid)) goto nla_put_failure; qdisc = rtnl_dereference(dev->qdisc); if (nla_put_string(skb, IFLA_IFNAME, dev->name) || nla_put_u32(skb, IFLA_TXQLEN, dev->tx_queue_len) || nla_put_u8(skb, IFLA_OPERSTATE, netif_running(dev) ? 
dev->operstate : IF_OPER_DOWN) || nla_put_u8(skb, IFLA_LINKMODE, dev->link_mode) || nla_put_u32(skb, IFLA_MTU, dev->mtu) || nla_put_u32(skb, IFLA_MIN_MTU, dev->min_mtu) || nla_put_u32(skb, IFLA_MAX_MTU, dev->max_mtu) || nla_put_u32(skb, IFLA_GROUP, dev->group) || nla_put_u32(skb, IFLA_PROMISCUITY, dev->promiscuity) || nla_put_u32(skb, IFLA_NUM_TX_QUEUES, dev->num_tx_queues) || nla_put_u32(skb, IFLA_GSO_MAX_SEGS, dev->gso_max_segs) || nla_put_u32(skb, IFLA_GSO_MAX_SIZE, dev->gso_max_size) || #ifdef CONFIG_RPS nla_put_u32(skb, IFLA_NUM_RX_QUEUES, dev->num_rx_queues) || #endif put_master_ifindex(skb, dev) || nla_put_u8(skb, IFLA_CARRIER, netif_carrier_ok(dev)) || (qdisc && nla_put_string(skb, IFLA_QDISC, qdisc->ops->id)) || nla_put_ifalias(skb, dev) || nla_put_u32(skb, IFLA_CARRIER_CHANGES, atomic_read(&dev->carrier_up_count) + atomic_read(&dev->carrier_down_count)) || nla_put_u32(skb, IFLA_CARRIER_UP_COUNT, atomic_read(&dev->carrier_up_count)) || nla_put_u32(skb, IFLA_CARRIER_DOWN_COUNT, atomic_read(&dev->carrier_down_count))) goto nla_put_failure; if (rtnl_fill_proto_down(skb, dev)) goto nla_put_failure; if (event != IFLA_EVENT_NONE) { if (nla_put_u32(skb, IFLA_EVENT, event)) goto nla_put_failure; } if (rtnl_fill_link_ifmap(skb, dev)) goto nla_put_failure; if (dev->addr_len) { if (nla_put(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr) || nla_put(skb, IFLA_BROADCAST, dev->addr_len, dev->broadcast)) goto nla_put_failure; } if (rtnl_phys_port_id_fill(skb, dev)) goto nla_put_failure; if (rtnl_phys_port_name_fill(skb, dev)) goto nla_put_failure; if (rtnl_phys_switch_id_fill(skb, dev)) goto nla_put_failure; if (rtnl_fill_stats(skb, dev)) goto nla_put_failure; if (rtnl_fill_vf(skb, dev, ext_filter_mask)) goto nla_put_failure; if (rtnl_port_fill(skb, dev, ext_filter_mask)) goto nla_put_failure; if (rtnl_xdp_fill(skb, dev)) goto nla_put_failure; if (dev->rtnl_link_ops || rtnl_have_link_slave_info(dev)) { if (rtnl_link_fill(skb, dev) < 0) goto nla_put_failure; } if (rtnl_fill_link_netnsid(skb, dev, src_net, gfp)) goto nla_put_failure; if (new_nsid && nla_put_s32(skb, IFLA_NEW_NETNSID, *new_nsid) < 0) goto nla_put_failure; if (new_ifindex && nla_put_s32(skb, IFLA_NEW_IFINDEX, new_ifindex) < 0) goto nla_put_failure; if (memchr_inv(dev->perm_addr, '\0', dev->addr_len) && nla_put(skb, IFLA_PERM_ADDRESS, dev->addr_len, dev->perm_addr)) goto nla_put_failure; rcu_read_lock(); if (rtnl_fill_link_af(skb, dev, ext_filter_mask)) goto nla_put_failure_rcu; rcu_read_unlock(); if (rtnl_fill_prop_list(skb, dev)) goto nla_put_failure; if (dev->dev.parent && nla_put_string(skb, IFLA_PARENT_DEV_NAME, dev_name(dev->dev.parent))) goto nla_put_failure; if (dev->dev.parent && dev->dev.parent->bus && nla_put_string(skb, IFLA_PARENT_DEV_BUS_NAME, dev->dev.parent->bus->name)) goto nla_put_failure; nlmsg_end(skb, nlh); return 0; nla_put_failure_rcu: rcu_read_unlock(); nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_IFNAME] = { .type = NLA_STRING, .len = IFNAMSIZ-1 }, [IFLA_ADDRESS] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, [IFLA_BROADCAST] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, [IFLA_MAP] = { .len = sizeof(struct rtnl_link_ifmap) }, [IFLA_MTU] = { .type = NLA_U32 }, [IFLA_LINK] = { .type = NLA_U32 }, [IFLA_MASTER] = { .type = NLA_U32 }, [IFLA_CARRIER] = { .type = NLA_U8 }, [IFLA_TXQLEN] = { .type = NLA_U32 }, [IFLA_WEIGHT] = { .type = NLA_U32 }, [IFLA_OPERSTATE] = { .type = NLA_U8 }, [IFLA_LINKMODE] = { .type = NLA_U8 }, 
	[IFLA_LINKINFO] = { .type = NLA_NESTED },
	[IFLA_NET_NS_PID] = { .type = NLA_U32 },
	[IFLA_NET_NS_FD] = { .type = NLA_U32 },
	/* IFLA_IFALIAS is a string, but policy is set to NLA_BINARY to
	 * allow 0-length string (needed to remove an alias).
	 */
	[IFLA_IFALIAS] = { .type = NLA_BINARY, .len = IFALIASZ - 1 },
	[IFLA_VFINFO_LIST] = { .type = NLA_NESTED },
	[IFLA_VF_PORTS] = { .type = NLA_NESTED },
	[IFLA_PORT_SELF] = { .type = NLA_NESTED },
	[IFLA_AF_SPEC] = { .type = NLA_NESTED },
	[IFLA_EXT_MASK] = { .type = NLA_U32 },
	[IFLA_PROMISCUITY] = { .type = NLA_U32 },
	[IFLA_NUM_TX_QUEUES] = { .type = NLA_U32 },
	[IFLA_NUM_RX_QUEUES] = { .type = NLA_U32 },
	[IFLA_GSO_MAX_SEGS] = { .type = NLA_U32 },
	[IFLA_GSO_MAX_SIZE] = { .type = NLA_U32 },
	[IFLA_PHYS_PORT_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN },
	[IFLA_CARRIER_CHANGES] = { .type = NLA_U32 },	/* ignored */
	[IFLA_PHYS_SWITCH_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN },
	[IFLA_LINK_NETNSID] = { .type = NLA_S32 },
	[IFLA_PROTO_DOWN] = { .type = NLA_U8 },
	[IFLA_XDP] = { .type = NLA_NESTED },
	[IFLA_EVENT] = { .type = NLA_U32 },
	[IFLA_GROUP] = { .type = NLA_U32 },
	[IFLA_TARGET_NETNSID] = { .type = NLA_S32 },
	[IFLA_CARRIER_UP_COUNT] = { .type = NLA_U32 },
	[IFLA_CARRIER_DOWN_COUNT] = { .type = NLA_U32 },
	[IFLA_MIN_MTU] = { .type = NLA_U32 },
	[IFLA_MAX_MTU] = { .type = NLA_U32 },
	[IFLA_PROP_LIST] = { .type = NLA_NESTED },
	[IFLA_ALT_IFNAME] = { .type = NLA_STRING, .len = ALTIFNAMSIZ - 1 },
	[IFLA_PERM_ADDRESS] = { .type = NLA_REJECT },
	[IFLA_PROTO_DOWN_REASON] = { .type = NLA_NESTED },
	[IFLA_NEW_IFINDEX] = NLA_POLICY_MIN(NLA_S32, 1),
	[IFLA_PARENT_DEV_NAME] = { .type = NLA_NUL_STRING },
};

static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = {
	[IFLA_INFO_KIND] = { .type = NLA_STRING },
	[IFLA_INFO_DATA] = { .type = NLA_NESTED },
	[IFLA_INFO_SLAVE_KIND] = { .type = NLA_STRING },
	[IFLA_INFO_SLAVE_DATA] = { .type = NLA_NESTED },
};

static const struct nla_policy ifla_vf_policy[IFLA_VF_MAX+1] = {
	[IFLA_VF_MAC] = { .len = sizeof(struct ifla_vf_mac) },
	[IFLA_VF_BROADCAST] = { .type = NLA_REJECT },
	[IFLA_VF_VLAN] = { .len = sizeof(struct ifla_vf_vlan) },
	[IFLA_VF_VLAN_LIST] = { .type = NLA_NESTED },
	[IFLA_VF_TX_RATE] = { .len = sizeof(struct ifla_vf_tx_rate) },
	[IFLA_VF_SPOOFCHK] = { .len = sizeof(struct ifla_vf_spoofchk) },
	[IFLA_VF_RATE] = { .len = sizeof(struct ifla_vf_rate) },
	[IFLA_VF_LINK_STATE] = { .len = sizeof(struct ifla_vf_link_state) },
	[IFLA_VF_RSS_QUERY_EN] = { .len = sizeof(struct ifla_vf_rss_query_en) },
	[IFLA_VF_STATS] = { .type = NLA_NESTED },
	[IFLA_VF_TRUST] = { .len = sizeof(struct ifla_vf_trust) },
	[IFLA_VF_IB_NODE_GUID] = { .len = sizeof(struct ifla_vf_guid) },
	[IFLA_VF_IB_PORT_GUID] = { .len = sizeof(struct ifla_vf_guid) },
};

static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = {
	[IFLA_PORT_VF] = { .type = NLA_U32 },
	[IFLA_PORT_PROFILE] = { .type = NLA_STRING, .len = PORT_PROFILE_MAX },
	[IFLA_PORT_INSTANCE_UUID] = { .type = NLA_BINARY, .len = PORT_UUID_MAX },
	[IFLA_PORT_HOST_UUID] = { .type = NLA_STRING, .len = PORT_UUID_MAX },
	[IFLA_PORT_REQUEST] = { .type = NLA_U8, },
	[IFLA_PORT_RESPONSE] = { .type = NLA_U16, },

	/* Unused, but we need to keep it here since user space could
	 * fill it. It's also broken with regard to NLA_BINARY use in
	 * combination with structs.
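	 * (NLA_BINARY enforces only a maximum length, so nothing here
	 * guarantees a payload of at least sizeof(struct ifla_port_vsi).)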
*/ [IFLA_PORT_VSI_TYPE] = { .type = NLA_BINARY, .len = sizeof(struct ifla_port_vsi) }, }; static const struct nla_policy ifla_xdp_policy[IFLA_XDP_MAX + 1] = { [IFLA_XDP_UNSPEC] = { .strict_start_type = IFLA_XDP_EXPECTED_FD }, [IFLA_XDP_FD] = { .type = NLA_S32 }, [IFLA_XDP_EXPECTED_FD] = { .type = NLA_S32 }, [IFLA_XDP_ATTACHED] = { .type = NLA_U8 }, [IFLA_XDP_FLAGS] = { .type = NLA_U32 }, [IFLA_XDP_PROG_ID] = { .type = NLA_U32 }, }; static const struct rtnl_link_ops *linkinfo_to_kind_ops(const struct nlattr *nla) { const struct rtnl_link_ops *ops = NULL; struct nlattr *linfo[IFLA_INFO_MAX + 1]; if (nla_parse_nested_deprecated(linfo, IFLA_INFO_MAX, nla, ifla_info_policy, NULL) < 0) return NULL; if (linfo[IFLA_INFO_KIND]) { char kind[MODULE_NAME_LEN]; nla_strscpy(kind, linfo[IFLA_INFO_KIND], sizeof(kind)); ops = rtnl_link_ops_get(kind); } return ops; } static bool link_master_filtered(struct net_device *dev, int master_idx) { struct net_device *master; if (!master_idx) return false; master = netdev_master_upper_dev_get(dev); /* 0 is already used to denote IFLA_MASTER wasn't passed, therefore need * another invalid value for ifindex to denote "no master". */ if (master_idx == -1) return !!master; if (!master || master->ifindex != master_idx) return true; return false; } static bool link_kind_filtered(const struct net_device *dev, const struct rtnl_link_ops *kind_ops) { if (kind_ops && dev->rtnl_link_ops != kind_ops) return true; return false; } static bool link_dump_filtered(struct net_device *dev, int master_idx, const struct rtnl_link_ops *kind_ops) { if (link_master_filtered(dev, master_idx) || link_kind_filtered(dev, kind_ops)) return true; return false; } /** * rtnl_get_net_ns_capable - Get netns if sufficiently privileged. * @sk: netlink socket * @netnsid: network namespace identifier * * Returns the network namespace identified by netnsid on success or an error * pointer on failure. */ struct net *rtnl_get_net_ns_capable(struct sock *sk, int netnsid) { struct net *net; net = get_net_ns_by_id(sock_net(sk), netnsid); if (!net) return ERR_PTR(-EINVAL); /* For now, the caller is required to have CAP_NET_ADMIN in * the user namespace owning the target net ns. */ if (!sk_ns_capable(sk, net->user_ns, CAP_NET_ADMIN)) { put_net(net); return ERR_PTR(-EACCES); } return net; } EXPORT_SYMBOL_GPL(rtnl_get_net_ns_capable); static int rtnl_valid_dump_ifinfo_req(const struct nlmsghdr *nlh, bool strict_check, struct nlattr **tb, struct netlink_ext_ack *extack) { int hdrlen; if (strict_check) { struct ifinfomsg *ifm; if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) { NL_SET_ERR_MSG(extack, "Invalid header for link dump"); return -EINVAL; } ifm = nlmsg_data(nlh); if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags || ifm->ifi_change) { NL_SET_ERR_MSG(extack, "Invalid values in header for link dump request"); return -EINVAL; } if (ifm->ifi_index) { NL_SET_ERR_MSG(extack, "Filter by device index not supported for link dumps"); return -EINVAL; } return nlmsg_parse_deprecated_strict(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, extack); } /* A hack to preserve kernel<->userspace interface. * The correct header is ifinfomsg. It is consistent with rtnl_getlink. * However, before Linux v3.9 the code here assumed rtgenmsg and that's * what iproute2 < v3.9.0 used. * We can detect the old iproute2. Even including the IFLA_EXT_MASK * attribute, its netlink message is shorter than struct ifinfomsg. */ hdrlen = nlmsg_len(nlh) < sizeof(struct ifinfomsg) ? 
sizeof(struct rtgenmsg) : sizeof(struct ifinfomsg); return nlmsg_parse_deprecated(nlh, hdrlen, tb, IFLA_MAX, ifla_policy, extack); } static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) { struct netlink_ext_ack *extack = cb->extack; const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); struct net *tgt_net = net; int h, s_h; int idx = 0, s_idx; struct net_device *dev; struct hlist_head *head; struct nlattr *tb[IFLA_MAX+1]; u32 ext_filter_mask = 0; const struct rtnl_link_ops *kind_ops = NULL; unsigned int flags = NLM_F_MULTI; int master_idx = 0; int netnsid = -1; int err, i; s_h = cb->args[0]; s_idx = cb->args[1]; err = rtnl_valid_dump_ifinfo_req(nlh, cb->strict_check, tb, extack); if (err < 0) { if (cb->strict_check) return err; goto walk_entries; } for (i = 0; i <= IFLA_MAX; ++i) { if (!tb[i]) continue; /* new attributes should only be added with strict checking */ switch (i) { case IFLA_TARGET_NETNSID: netnsid = nla_get_s32(tb[i]); tgt_net = rtnl_get_net_ns_capable(skb->sk, netnsid); if (IS_ERR(tgt_net)) { NL_SET_ERR_MSG(extack, "Invalid target network namespace id"); return PTR_ERR(tgt_net); } break; case IFLA_EXT_MASK: ext_filter_mask = nla_get_u32(tb[i]); break; case IFLA_MASTER: master_idx = nla_get_u32(tb[i]); break; case IFLA_LINKINFO: kind_ops = linkinfo_to_kind_ops(tb[i]); break; default: if (cb->strict_check) { NL_SET_ERR_MSG(extack, "Unsupported attribute in link dump request"); return -EINVAL; } } } if (master_idx || kind_ops) flags |= NLM_F_DUMP_FILTERED; walk_entries: for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { idx = 0; head = &tgt_net->dev_index_head[h]; hlist_for_each_entry(dev, head, index_hlist) { if (link_dump_filtered(dev, master_idx, kind_ops)) goto cont; if (idx < s_idx) goto cont; err = rtnl_fill_ifinfo(skb, dev, net, RTM_NEWLINK, NETLINK_CB(cb->skb).portid, nlh->nlmsg_seq, 0, flags, ext_filter_mask, 0, NULL, 0, netnsid, GFP_KERNEL); if (err < 0) { if (likely(skb->len)) goto out; goto out_err; } cont: idx++; } } out: err = skb->len; out_err: cb->args[1] = idx; cb->args[0] = h; cb->seq = tgt_net->dev_base_seq; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); if (netnsid >= 0) put_net(tgt_net); return err; } int rtnl_nla_parse_ifinfomsg(struct nlattr **tb, const struct nlattr *nla_peer, struct netlink_ext_ack *exterr) { const struct ifinfomsg *ifmp; const struct nlattr *attrs; size_t len; ifmp = nla_data(nla_peer); attrs = nla_data(nla_peer) + sizeof(struct ifinfomsg); len = nla_len(nla_peer) - sizeof(struct ifinfomsg); if (ifmp->ifi_index < 0) { NL_SET_ERR_MSG_ATTR(exterr, nla_peer, "ifindex can't be negative"); return -EINVAL; } return nla_parse_deprecated(tb, IFLA_MAX, attrs, len, ifla_policy, exterr); } EXPORT_SYMBOL(rtnl_nla_parse_ifinfomsg); struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[]) { struct net *net; /* Examine the link attributes and figure out which * network namespace we are talking about. */ if (tb[IFLA_NET_NS_PID]) net = get_net_ns_by_pid(nla_get_u32(tb[IFLA_NET_NS_PID])); else if (tb[IFLA_NET_NS_FD]) net = get_net_ns_by_fd(nla_get_u32(tb[IFLA_NET_NS_FD])); else net = get_net(src_net); return net; } EXPORT_SYMBOL(rtnl_link_get_net); /* Figure out which network namespace we are talking about by * examining the link attributes in the following order: * * 1. IFLA_NET_NS_PID * 2. IFLA_NET_NS_FD * 3. 
IFLA_TARGET_NETNSID */ static struct net *rtnl_link_get_net_by_nlattr(struct net *src_net, struct nlattr *tb[]) { struct net *net; if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD]) return rtnl_link_get_net(src_net, tb); if (!tb[IFLA_TARGET_NETNSID]) return get_net(src_net); net = get_net_ns_by_id(src_net, nla_get_u32(tb[IFLA_TARGET_NETNSID])); if (!net) return ERR_PTR(-EINVAL); return net; } static struct net *rtnl_link_get_net_capable(const struct sk_buff *skb, struct net *src_net, struct nlattr *tb[], int cap) { struct net *net; net = rtnl_link_get_net_by_nlattr(src_net, tb); if (IS_ERR(net)) return net; if (!netlink_ns_capable(skb, net->user_ns, cap)) { put_net(net); return ERR_PTR(-EPERM); } return net; } /* Verify that rtnetlink requests do not pass additional properties * potentially referring to different network namespaces. */ static int rtnl_ensure_unique_netns(struct nlattr *tb[], struct netlink_ext_ack *extack, bool netns_id_only) { if (netns_id_only) { if (!tb[IFLA_NET_NS_PID] && !tb[IFLA_NET_NS_FD]) return 0; NL_SET_ERR_MSG(extack, "specified netns attribute not supported"); return -EOPNOTSUPP; } if (tb[IFLA_TARGET_NETNSID] && (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD])) goto invalid_attr; if (tb[IFLA_NET_NS_PID] && (tb[IFLA_TARGET_NETNSID] || tb[IFLA_NET_NS_FD])) goto invalid_attr; if (tb[IFLA_NET_NS_FD] && (tb[IFLA_TARGET_NETNSID] || tb[IFLA_NET_NS_PID])) goto invalid_attr; return 0; invalid_attr: NL_SET_ERR_MSG(extack, "multiple netns identifying attributes specified"); return -EINVAL; } static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[], struct netlink_ext_ack *extack) { if (dev) { if (tb[IFLA_ADDRESS] && nla_len(tb[IFLA_ADDRESS]) < dev->addr_len) return -EINVAL; if (tb[IFLA_BROADCAST] && nla_len(tb[IFLA_BROADCAST]) < dev->addr_len) return -EINVAL; } if (tb[IFLA_AF_SPEC]) { struct nlattr *af; int rem, err; nla_for_each_nested(af, tb[IFLA_AF_SPEC], rem) { const struct rtnl_af_ops *af_ops; af_ops = rtnl_af_lookup(nla_type(af)); if (!af_ops) return -EAFNOSUPPORT; if (!af_ops->set_link_af) return -EOPNOTSUPP; if (af_ops->validate_link_af) { err = af_ops->validate_link_af(dev, af, extack); if (err < 0) return err; } } } return 0; } static int handle_infiniband_guid(struct net_device *dev, struct ifla_vf_guid *ivt, int guid_type) { const struct net_device_ops *ops = dev->netdev_ops; return ops->ndo_set_vf_guid(dev, ivt->vf, ivt->guid, guid_type); } static int handle_vf_guid(struct net_device *dev, struct ifla_vf_guid *ivt, int guid_type) { if (dev->type != ARPHRD_INFINIBAND) return -EOPNOTSUPP; return handle_infiniband_guid(dev, ivt, guid_type); } static int do_setvfinfo(struct net_device *dev, struct nlattr **tb) { const struct net_device_ops *ops = dev->netdev_ops; int err = -EINVAL; if (tb[IFLA_VF_MAC]) { struct ifla_vf_mac *ivm = nla_data(tb[IFLA_VF_MAC]); if (ivm->vf >= INT_MAX) return -EINVAL; err = -EOPNOTSUPP; if (ops->ndo_set_vf_mac) err = ops->ndo_set_vf_mac(dev, ivm->vf, ivm->mac); if (err < 0) return err; } if (tb[IFLA_VF_VLAN]) { struct ifla_vf_vlan *ivv = nla_data(tb[IFLA_VF_VLAN]); if (ivv->vf >= INT_MAX) return -EINVAL; err = -EOPNOTSUPP; if (ops->ndo_set_vf_vlan) err = ops->ndo_set_vf_vlan(dev, ivv->vf, ivv->vlan, ivv->qos, htons(ETH_P_8021Q)); if (err < 0) return err; } if (tb[IFLA_VF_VLAN_LIST]) { struct ifla_vf_vlan_info *ivvl[MAX_VLAN_LIST_LEN]; struct nlattr *attr; int rem, len = 0; err = -EOPNOTSUPP; if (!ops->ndo_set_vf_vlan) return err; nla_for_each_nested(attr, tb[IFLA_VF_VLAN_LIST], rem) { if (nla_type(attr) != IFLA_VF_VLAN_INFO || 
nla_len(attr) < sizeof(struct ifla_vf_vlan_info)) { return -EINVAL; } if (len >= MAX_VLAN_LIST_LEN) return -EOPNOTSUPP; ivvl[len] = nla_data(attr); len++; } if (len == 0) return -EINVAL; if (ivvl[0]->vf >= INT_MAX) return -EINVAL; err = ops->ndo_set_vf_vlan(dev, ivvl[0]->vf, ivvl[0]->vlan, ivvl[0]->qos, ivvl[0]->vlan_proto); if (err < 0) return err; } if (tb[IFLA_VF_TX_RATE]) { struct ifla_vf_tx_rate *ivt = nla_data(tb[IFLA_VF_TX_RATE]); struct ifla_vf_info ivf; if (ivt->vf >= INT_MAX) return -EINVAL; err = -EOPNOTSUPP; if (ops->ndo_get_vf_config) err = ops->ndo_get_vf_config(dev, ivt->vf, &ivf); if (err < 0) return err; err = -EOPNOTSUPP; if (ops->ndo_set_vf_rate) err = ops->ndo_set_vf_rate(dev, ivt->vf, ivf.min_tx_rate, ivt->rate); if (err < 0) return err; } if (tb[IFLA_VF_RATE]) { struct ifla_vf_rate *ivt = nla_data(tb[IFLA_VF_RATE]); if (ivt->vf >= INT_MAX) return -EINVAL; err = -EOPNOTSUPP; if (ops->ndo_set_vf_rate) err = ops->ndo_set_vf_rate(dev, ivt->vf, ivt->min_tx_rate, ivt->max_tx_rate); if (err < 0) return err; } if (tb[IFLA_VF_SPOOFCHK]) { struct ifla_vf_spoofchk *ivs = nla_data(tb[IFLA_VF_SPOOFCHK]); if (ivs->vf >= INT_MAX) return -EINVAL; err = -EOPNOTSUPP; if (ops->ndo_set_vf_spoofchk) err = ops->ndo_set_vf_spoofchk(dev, ivs->vf, ivs->setting); if (err < 0) return err; } if (tb[IFLA_VF_LINK_STATE]) { struct ifla_vf_link_state *ivl = nla_data(tb[IFLA_VF_LINK_STATE]); if (ivl->vf >= INT_MAX) return -EINVAL; err = -EOPNOTSUPP; if (ops->ndo_set_vf_link_state) err = ops->ndo_set_vf_link_state(dev, ivl->vf, ivl->link_state); if (err < 0) return err; } if (tb[IFLA_VF_RSS_QUERY_EN]) { struct ifla_vf_rss_query_en *ivrssq_en; err = -EOPNOTSUPP; ivrssq_en = nla_data(tb[IFLA_VF_RSS_QUERY_EN]); if (ivrssq_en->vf >= INT_MAX) return -EINVAL; if (ops->ndo_set_vf_rss_query_en) err = ops->ndo_set_vf_rss_query_en(dev, ivrssq_en->vf, ivrssq_en->setting); if (err < 0) return err; } if (tb[IFLA_VF_TRUST]) { struct ifla_vf_trust *ivt = nla_data(tb[IFLA_VF_TRUST]); if (ivt->vf >= INT_MAX) return -EINVAL; err = -EOPNOTSUPP; if (ops->ndo_set_vf_trust) err = ops->ndo_set_vf_trust(dev, ivt->vf, ivt->setting); if (err < 0) return err; } if (tb[IFLA_VF_IB_NODE_GUID]) { struct ifla_vf_guid *ivt = nla_data(tb[IFLA_VF_IB_NODE_GUID]); if (ivt->vf >= INT_MAX) return -EINVAL; if (!ops->ndo_set_vf_guid) return -EOPNOTSUPP; return handle_vf_guid(dev, ivt, IFLA_VF_IB_NODE_GUID); } if (tb[IFLA_VF_IB_PORT_GUID]) { struct ifla_vf_guid *ivt = nla_data(tb[IFLA_VF_IB_PORT_GUID]); if (ivt->vf >= INT_MAX) return -EINVAL; if (!ops->ndo_set_vf_guid) return -EOPNOTSUPP; return handle_vf_guid(dev, ivt, IFLA_VF_IB_PORT_GUID); } return err; } static int do_set_master(struct net_device *dev, int ifindex, struct netlink_ext_ack *extack) { struct net_device *upper_dev = netdev_master_upper_dev_get(dev); const struct net_device_ops *ops; int err; if (upper_dev) { if (upper_dev->ifindex == ifindex) return 0; ops = upper_dev->netdev_ops; if (ops->ndo_del_slave) { err = ops->ndo_del_slave(upper_dev, dev); if (err) return err; } else { return -EOPNOTSUPP; } } if (ifindex) { upper_dev = __dev_get_by_index(dev_net(dev), ifindex); if (!upper_dev) return -EINVAL; ops = upper_dev->netdev_ops; if (ops->ndo_add_slave) { err = ops->ndo_add_slave(upper_dev, dev, extack); if (err) return err; } else { return -EOPNOTSUPP; } } return 0; } static const struct nla_policy ifla_proto_down_reason_policy[IFLA_PROTO_DOWN_REASON_VALUE + 1] = { [IFLA_PROTO_DOWN_REASON_MASK] = { .type = NLA_U32 }, [IFLA_PROTO_DOWN_REASON_VALUE] = { .type = NLA_U32 }, }; 
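/*
 * Illustrative layout (not taken from this file; the values are examples
 * only) of an RTM_SETLINK request that do_set_proto_down() below accepts:
 *
 *	struct ifinfomsg		ifi_index = <target device>
 *	IFLA_PROTO_DOWN			(u8)  1
 *	IFLA_PROTO_DOWN_REASON		(nested)
 *		IFLA_PROTO_DOWN_REASON_MASK	(u32) 0x1
 *		IFLA_PROTO_DOWN_REASON_VALUE	(u32) 0x1
 *
 * The mask selects which reason bits to update and the value supplies their
 * new state; the request is rejected with -EOPNOTSUPP unless the device
 * implements ndo_change_proto_down.
 */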
static int do_set_proto_down(struct net_device *dev,
			     struct nlattr *nl_proto_down,
			     struct nlattr *nl_proto_down_reason,
			     struct netlink_ext_ack *extack)
{
	struct nlattr *pdreason[IFLA_PROTO_DOWN_REASON_MAX + 1];
	const struct net_device_ops *ops = dev->netdev_ops;
	unsigned long mask = 0;
	u32 value;
	bool proto_down;
	int err;

	if (!ops->ndo_change_proto_down) {
		NL_SET_ERR_MSG(extack, "Protodown not supported by device");
		return -EOPNOTSUPP;
	}

	if (nl_proto_down_reason) {
		err = nla_parse_nested_deprecated(pdreason,
						  IFLA_PROTO_DOWN_REASON_MAX,
						  nl_proto_down_reason,
						  ifla_proto_down_reason_policy,
						  NULL);
		if (err < 0)
			return err;

		if (!pdreason[IFLA_PROTO_DOWN_REASON_VALUE]) {
			NL_SET_ERR_MSG(extack, "Invalid protodown reason value");
			return -EINVAL;
		}

		value = nla_get_u32(pdreason[IFLA_PROTO_DOWN_REASON_VALUE]);
		if (pdreason[IFLA_PROTO_DOWN_REASON_MASK])
			mask = nla_get_u32(pdreason[IFLA_PROTO_DOWN_REASON_MASK]);

		dev_change_proto_down_reason(dev, mask, value);
	}

	if (nl_proto_down) {
		proto_down = nla_get_u8(nl_proto_down);

		/* Don't turn off protodown if there are active reasons */
		if (!proto_down && dev->proto_down_reason) {
			NL_SET_ERR_MSG(extack, "Cannot clear protodown, active reasons");
			return -EBUSY;
		}
		err = dev_change_proto_down(dev, proto_down);
		if (err)
			return err;
	}

	return 0;
}

#define DO_SETLINK_MODIFIED	0x01
/* notify flag means notify + modified. */
#define DO_SETLINK_NOTIFY	0x03
static int do_setlink(const struct sk_buff *skb,
		      struct net_device *dev, struct ifinfomsg *ifm,
		      struct netlink_ext_ack *extack,
		      struct nlattr **tb, int status)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	char ifname[IFNAMSIZ];
	int err;

	err = validate_linkmsg(dev, tb, extack);
	if (err < 0)
		return err;

	if (tb[IFLA_IFNAME])
		nla_strscpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ);
	else
		ifname[0] = '\0';

	if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD] || tb[IFLA_TARGET_NETNSID]) {
		const char *pat = ifname[0] ?
ifname : NULL; struct net *net; int new_ifindex; net = rtnl_link_get_net_capable(skb, dev_net(dev), tb, CAP_NET_ADMIN); if (IS_ERR(net)) { err = PTR_ERR(net); goto errout; } if (tb[IFLA_NEW_IFINDEX]) new_ifindex = nla_get_s32(tb[IFLA_NEW_IFINDEX]); else new_ifindex = 0; err = __dev_change_net_namespace(dev, net, pat, new_ifindex); put_net(net); if (err) goto errout; status |= DO_SETLINK_MODIFIED; } if (tb[IFLA_MAP]) { struct rtnl_link_ifmap *u_map; struct ifmap k_map; if (!ops->ndo_set_config) { err = -EOPNOTSUPP; goto errout; } if (!netif_device_present(dev)) { err = -ENODEV; goto errout; } u_map = nla_data(tb[IFLA_MAP]); k_map.mem_start = (unsigned long) u_map->mem_start; k_map.mem_end = (unsigned long) u_map->mem_end; k_map.base_addr = (unsigned short) u_map->base_addr; k_map.irq = (unsigned char) u_map->irq; k_map.dma = (unsigned char) u_map->dma; k_map.port = (unsigned char) u_map->port; err = ops->ndo_set_config(dev, &k_map); if (err < 0) goto errout; status |= DO_SETLINK_NOTIFY; } if (tb[IFLA_ADDRESS]) { struct sockaddr *sa; int len; len = sizeof(sa_family_t) + max_t(size_t, dev->addr_len, sizeof(*sa)); sa = kmalloc(len, GFP_KERNEL); if (!sa) { err = -ENOMEM; goto errout; } sa->sa_family = dev->type; memcpy(sa->sa_data, nla_data(tb[IFLA_ADDRESS]), dev->addr_len); err = dev_set_mac_address_user(dev, sa, extack); kfree(sa); if (err) goto errout; status |= DO_SETLINK_MODIFIED; } if (tb[IFLA_MTU]) { err = dev_set_mtu_ext(dev, nla_get_u32(tb[IFLA_MTU]), extack); if (err < 0) goto errout; status |= DO_SETLINK_MODIFIED; } if (tb[IFLA_GROUP]) { dev_set_group(dev, nla_get_u32(tb[IFLA_GROUP])); status |= DO_SETLINK_NOTIFY; } /* * Interface selected by interface index but interface * name provided implies that a name change has been * requested. */ if (ifm->ifi_index > 0 && ifname[0]) { err = dev_change_name(dev, ifname); if (err < 0) goto errout; status |= DO_SETLINK_MODIFIED; } if (tb[IFLA_IFALIAS]) { err = dev_set_alias(dev, nla_data(tb[IFLA_IFALIAS]), nla_len(tb[IFLA_IFALIAS])); if (err < 0) goto errout; status |= DO_SETLINK_NOTIFY; } if (tb[IFLA_BROADCAST]) { nla_memcpy(dev->broadcast, tb[IFLA_BROADCAST], dev->addr_len); call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); } if (ifm->ifi_flags || ifm->ifi_change) { err = dev_change_flags(dev, rtnl_dev_combine_flags(dev, ifm), extack); if (err < 0) goto errout; } if (tb[IFLA_MASTER]) { err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]), extack); if (err) goto errout; status |= DO_SETLINK_MODIFIED; } if (tb[IFLA_CARRIER]) { err = dev_change_carrier(dev, nla_get_u8(tb[IFLA_CARRIER])); if (err) goto errout; status |= DO_SETLINK_MODIFIED; } if (tb[IFLA_TXQLEN]) { unsigned int value = nla_get_u32(tb[IFLA_TXQLEN]); err = dev_change_tx_queue_len(dev, value); if (err) goto errout; status |= DO_SETLINK_MODIFIED; } if (tb[IFLA_GSO_MAX_SIZE]) { u32 max_size = nla_get_u32(tb[IFLA_GSO_MAX_SIZE]); if (max_size > GSO_MAX_SIZE) { err = -EINVAL; goto errout; } if (dev->gso_max_size ^ max_size) { netif_set_gso_max_size(dev, max_size); status |= DO_SETLINK_MODIFIED; } } if (tb[IFLA_GSO_MAX_SEGS]) { u32 max_segs = nla_get_u32(tb[IFLA_GSO_MAX_SEGS]); if (max_segs > GSO_MAX_SEGS) { err = -EINVAL; goto errout; } if (dev->gso_max_segs ^ max_segs) { dev->gso_max_segs = max_segs; status |= DO_SETLINK_MODIFIED; } } if (tb[IFLA_OPERSTATE]) set_operstate(dev, nla_get_u8(tb[IFLA_OPERSTATE])); if (tb[IFLA_LINKMODE]) { unsigned char value = nla_get_u8(tb[IFLA_LINKMODE]); write_lock(&dev_base_lock); if (dev->link_mode ^ value) status |= DO_SETLINK_NOTIFY; dev->link_mode = 
value; write_unlock(&dev_base_lock); } if (tb[IFLA_VFINFO_LIST]) { struct nlattr *vfinfo[IFLA_VF_MAX + 1]; struct nlattr *attr; int rem; nla_for_each_nested(attr, tb[IFLA_VFINFO_LIST], rem) { if (nla_type(attr) != IFLA_VF_INFO || nla_len(attr) < NLA_HDRLEN) { err = -EINVAL; goto errout; } err = nla_parse_nested_deprecated(vfinfo, IFLA_VF_MAX, attr, ifla_vf_policy, NULL); if (err < 0) goto errout; err = do_setvfinfo(dev, vfinfo); if (err < 0) goto errout; status |= DO_SETLINK_NOTIFY; } } err = 0; if (tb[IFLA_VF_PORTS]) { struct nlattr *port[IFLA_PORT_MAX+1]; struct nlattr *attr; int vf; int rem; err = -EOPNOTSUPP; if (!ops->ndo_set_vf_port) goto errout; nla_for_each_nested(attr, tb[IFLA_VF_PORTS], rem) { if (nla_type(attr) != IFLA_VF_PORT || nla_len(attr) < NLA_HDRLEN) { err = -EINVAL; goto errout; } err = nla_parse_nested_deprecated(port, IFLA_PORT_MAX, attr, ifla_port_policy, NULL); if (err < 0) goto errout; if (!port[IFLA_PORT_VF]) { err = -EOPNOTSUPP; goto errout; } vf = nla_get_u32(port[IFLA_PORT_VF]); err = ops->ndo_set_vf_port(dev, vf, port); if (err < 0) goto errout; status |= DO_SETLINK_NOTIFY; } } err = 0; if (tb[IFLA_PORT_SELF]) { struct nlattr *port[IFLA_PORT_MAX+1]; err = nla_parse_nested_deprecated(port, IFLA_PORT_MAX, tb[IFLA_PORT_SELF], ifla_port_policy, NULL); if (err < 0) goto errout; err = -EOPNOTSUPP; if (ops->ndo_set_vf_port) err = ops->ndo_set_vf_port(dev, PORT_SELF_VF, port); if (err < 0) goto errout; status |= DO_SETLINK_NOTIFY; } if (tb[IFLA_AF_SPEC]) { struct nlattr *af; int rem; nla_for_each_nested(af, tb[IFLA_AF_SPEC], rem) { const struct rtnl_af_ops *af_ops; BUG_ON(!(af_ops = rtnl_af_lookup(nla_type(af)))); err = af_ops->set_link_af(dev, af, extack); if (err < 0) goto errout; status |= DO_SETLINK_NOTIFY; } } err = 0; if (tb[IFLA_PROTO_DOWN] || tb[IFLA_PROTO_DOWN_REASON]) { err = do_set_proto_down(dev, tb[IFLA_PROTO_DOWN], tb[IFLA_PROTO_DOWN_REASON], extack); if (err) goto errout; status |= DO_SETLINK_NOTIFY; } if (tb[IFLA_XDP]) { struct nlattr *xdp[IFLA_XDP_MAX + 1]; u32 xdp_flags = 0; err = nla_parse_nested_deprecated(xdp, IFLA_XDP_MAX, tb[IFLA_XDP], ifla_xdp_policy, NULL); if (err < 0) goto errout; if (xdp[IFLA_XDP_ATTACHED] || xdp[IFLA_XDP_PROG_ID]) { err = -EINVAL; goto errout; } if (xdp[IFLA_XDP_FLAGS]) { xdp_flags = nla_get_u32(xdp[IFLA_XDP_FLAGS]); if (xdp_flags & ~XDP_FLAGS_MASK) { err = -EINVAL; goto errout; } if (hweight32(xdp_flags & XDP_FLAGS_MODES) > 1) { err = -EINVAL; goto errout; } } if (xdp[IFLA_XDP_FD]) { int expected_fd = -1; if (xdp_flags & XDP_FLAGS_REPLACE) { if (!xdp[IFLA_XDP_EXPECTED_FD]) { err = -EINVAL; goto errout; } expected_fd = nla_get_s32(xdp[IFLA_XDP_EXPECTED_FD]); } err = dev_change_xdp_fd(dev, extack, nla_get_s32(xdp[IFLA_XDP_FD]), expected_fd, xdp_flags); if (err) goto errout; status |= DO_SETLINK_NOTIFY; } } errout: if (status & DO_SETLINK_MODIFIED) { if ((status & DO_SETLINK_NOTIFY) == DO_SETLINK_NOTIFY) netdev_state_change(dev); if (err < 0) net_warn_ratelimited("A link change request failed with some changes committed already. 
Interface %s may have been left with an inconsistent configuration, please check.\n", dev->name); } return err; } static struct net_device *rtnl_dev_get(struct net *net, struct nlattr *tb[]) { char ifname[ALTIFNAMSIZ]; if (tb[IFLA_IFNAME]) nla_strscpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); else if (tb[IFLA_ALT_IFNAME]) nla_strscpy(ifname, tb[IFLA_ALT_IFNAME], ALTIFNAMSIZ); else return NULL; return __dev_get_by_name(net, ifname); } static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct ifinfomsg *ifm; struct net_device *dev; int err; struct nlattr *tb[IFLA_MAX+1]; err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, extack); if (err < 0) goto errout; err = rtnl_ensure_unique_netns(tb, extack, false); if (err < 0) goto errout; err = -EINVAL; ifm = nlmsg_data(nlh); if (ifm->ifi_index > 0) dev = __dev_get_by_index(net, ifm->ifi_index); else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) dev = rtnl_dev_get(net, tb); else goto errout; if (dev == NULL) { err = -ENODEV; goto errout; } err = do_setlink(skb, dev, ifm, extack, tb, 0); errout: return err; } static int rtnl_group_dellink(const struct net *net, int group) { struct net_device *dev, *aux; LIST_HEAD(list_kill); bool found = false; if (!group) return -EPERM; for_each_netdev(net, dev) { if (dev->group == group) { const struct rtnl_link_ops *ops; found = true; ops = dev->rtnl_link_ops; if (!ops || !ops->dellink) return -EOPNOTSUPP; } } if (!found) return -ENODEV; for_each_netdev_safe(net, dev, aux) { if (dev->group == group) { const struct rtnl_link_ops *ops; ops = dev->rtnl_link_ops; ops->dellink(dev, &list_kill); } } unregister_netdevice_many(&list_kill); return 0; } int rtnl_delete_link(struct net_device *dev) { const struct rtnl_link_ops *ops; LIST_HEAD(list_kill); ops = dev->rtnl_link_ops; if (!ops || !ops->dellink) return -EOPNOTSUPP; ops->dellink(dev, &list_kill); unregister_netdevice_many(&list_kill); return 0; } EXPORT_SYMBOL_GPL(rtnl_delete_link); static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct net *tgt_net = net; struct net_device *dev = NULL; struct ifinfomsg *ifm; struct nlattr *tb[IFLA_MAX+1]; int err; int netnsid = -1; err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, extack); if (err < 0) return err; err = rtnl_ensure_unique_netns(tb, extack, true); if (err < 0) return err; if (tb[IFLA_TARGET_NETNSID]) { netnsid = nla_get_s32(tb[IFLA_TARGET_NETNSID]); tgt_net = rtnl_get_net_ns_capable(NETLINK_CB(skb).sk, netnsid); if (IS_ERR(tgt_net)) return PTR_ERR(tgt_net); } err = -EINVAL; ifm = nlmsg_data(nlh); if (ifm->ifi_index > 0) dev = __dev_get_by_index(tgt_net, ifm->ifi_index); else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) dev = rtnl_dev_get(tgt_net, tb); else if (tb[IFLA_GROUP]) err = rtnl_group_dellink(tgt_net, nla_get_u32(tb[IFLA_GROUP])); else goto out; if (!dev) { if (tb[IFLA_IFNAME] || ifm->ifi_index > 0) err = -ENODEV; goto out; } err = rtnl_delete_link(dev); out: if (netnsid >= 0) put_net(tgt_net); return err; } int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm) { unsigned int old_flags; int err; old_flags = dev->flags; if (ifm && (ifm->ifi_flags || ifm->ifi_change)) { err = __dev_change_flags(dev, rtnl_dev_combine_flags(dev, ifm), NULL); if (err < 0) return err; } if (dev->rtnl_link_state == RTNL_LINK_INITIALIZED) { __dev_notify_flags(dev, old_flags, (old_flags ^ 
dev->flags)); } else { dev->rtnl_link_state = RTNL_LINK_INITIALIZED; __dev_notify_flags(dev, old_flags, ~0U); } return 0; } EXPORT_SYMBOL(rtnl_configure_link); struct net_device *rtnl_create_link(struct net *net, const char *ifname, unsigned char name_assign_type, const struct rtnl_link_ops *ops, struct nlattr *tb[], struct netlink_ext_ack *extack) { struct net_device *dev; unsigned int num_tx_queues = 1; unsigned int num_rx_queues = 1; int err; if (tb[IFLA_NUM_TX_QUEUES]) num_tx_queues = nla_get_u32(tb[IFLA_NUM_TX_QUEUES]); else if (ops->get_num_tx_queues) num_tx_queues = ops->get_num_tx_queues(); if (tb[IFLA_NUM_RX_QUEUES]) num_rx_queues = nla_get_u32(tb[IFLA_NUM_RX_QUEUES]); else if (ops->get_num_rx_queues) num_rx_queues = ops->get_num_rx_queues(); if (num_tx_queues < 1 || num_tx_queues > 4096) { NL_SET_ERR_MSG(extack, "Invalid number of transmit queues"); return ERR_PTR(-EINVAL); } if (num_rx_queues < 1 || num_rx_queues > 4096) { NL_SET_ERR_MSG(extack, "Invalid number of receive queues"); return ERR_PTR(-EINVAL); } if (ops->alloc) { dev = ops->alloc(tb, ifname, name_assign_type, num_tx_queues, num_rx_queues); if (IS_ERR(dev)) return dev; } else { dev = alloc_netdev_mqs(ops->priv_size, ifname, name_assign_type, ops->setup, num_tx_queues, num_rx_queues); } if (!dev) return ERR_PTR(-ENOMEM); err = validate_linkmsg(dev, tb, extack); if (err < 0) { free_netdev(dev); return ERR_PTR(err); } dev_net_set(dev, net); dev->rtnl_link_ops = ops; dev->rtnl_link_state = RTNL_LINK_INITIALIZING; if (tb[IFLA_MTU]) { u32 mtu = nla_get_u32(tb[IFLA_MTU]); err = dev_validate_mtu(dev, mtu, extack); if (err) { free_netdev(dev); return ERR_PTR(err); } dev->mtu = mtu; } if (tb[IFLA_ADDRESS]) { memcpy(dev->dev_addr, nla_data(tb[IFLA_ADDRESS]), nla_len(tb[IFLA_ADDRESS])); dev->addr_assign_type = NET_ADDR_SET; } if (tb[IFLA_BROADCAST]) memcpy(dev->broadcast, nla_data(tb[IFLA_BROADCAST]), nla_len(tb[IFLA_BROADCAST])); if (tb[IFLA_TXQLEN]) dev->tx_queue_len = nla_get_u32(tb[IFLA_TXQLEN]); if (tb[IFLA_OPERSTATE]) set_operstate(dev, nla_get_u8(tb[IFLA_OPERSTATE])); if (tb[IFLA_LINKMODE]) dev->link_mode = nla_get_u8(tb[IFLA_LINKMODE]); if (tb[IFLA_GROUP]) dev_set_group(dev, nla_get_u32(tb[IFLA_GROUP])); if (tb[IFLA_GSO_MAX_SIZE]) netif_set_gso_max_size(dev, nla_get_u32(tb[IFLA_GSO_MAX_SIZE])); if (tb[IFLA_GSO_MAX_SEGS]) dev->gso_max_segs = nla_get_u32(tb[IFLA_GSO_MAX_SEGS]); return dev; } EXPORT_SYMBOL(rtnl_create_link); static int rtnl_group_changelink(const struct sk_buff *skb, struct net *net, int group, struct ifinfomsg *ifm, struct netlink_ext_ack *extack, struct nlattr **tb) { struct net_device *dev, *aux; int err; for_each_netdev_safe(net, dev, aux) { if (dev->group == group) { err = do_setlink(skb, dev, ifm, extack, tb, 0); if (err < 0) return err; } } return 0; } static int __rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, struct nlattr **attr, struct netlink_ext_ack *extack) { struct nlattr *slave_attr[RTNL_SLAVE_MAX_TYPE + 1]; unsigned char name_assign_type = NET_NAME_USER; struct nlattr *linkinfo[IFLA_INFO_MAX + 1]; const struct rtnl_link_ops *m_ops; struct net_device *master_dev; struct net *net = sock_net(skb->sk); const struct rtnl_link_ops *ops; struct nlattr *tb[IFLA_MAX + 1]; struct net *dest_net, *link_net; struct nlattr **slave_data; char kind[MODULE_NAME_LEN]; struct net_device *dev; struct ifinfomsg *ifm; char ifname[IFNAMSIZ]; struct nlattr **data; bool link_specified; int err; #ifdef CONFIG_MODULES replay: #endif err = nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, 
extack); if (err < 0) return err; err = rtnl_ensure_unique_netns(tb, extack, false); if (err < 0) return err; ifm = nlmsg_data(nlh); if (ifm->ifi_index > 0) { link_specified = true; dev = __dev_get_by_index(net, ifm->ifi_index); } else if (ifm->ifi_index < 0) { NL_SET_ERR_MSG(extack, "ifindex can't be negative"); return -EINVAL; } else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) { link_specified = true; dev = rtnl_dev_get(net, tb); } else { link_specified = false; dev = NULL; } master_dev = NULL; m_ops = NULL; if (dev) { master_dev = netdev_master_upper_dev_get(dev); if (master_dev) m_ops = master_dev->rtnl_link_ops; } err = validate_linkmsg(dev, tb, extack); if (err < 0) return err; if (tb[IFLA_LINKINFO]) { err = nla_parse_nested_deprecated(linkinfo, IFLA_INFO_MAX, tb[IFLA_LINKINFO], ifla_info_policy, NULL); if (err < 0) return err; } else memset(linkinfo, 0, sizeof(linkinfo)); if (linkinfo[IFLA_INFO_KIND]) { nla_strscpy(kind, linkinfo[IFLA_INFO_KIND], sizeof(kind)); ops = rtnl_link_ops_get(kind); } else { kind[0] = '\0'; ops = NULL; } data = NULL; if (ops) { if (ops->maxtype > RTNL_MAX_TYPE) return -EINVAL; if (ops->maxtype && linkinfo[IFLA_INFO_DATA]) { err = nla_parse_nested_deprecated(attr, ops->maxtype, linkinfo[IFLA_INFO_DATA], ops->policy, extack); if (err < 0) return err; data = attr; } if (ops->validate) { err = ops->validate(tb, data, extack); if (err < 0) return err; } } slave_data = NULL; if (m_ops) { if (m_ops->slave_maxtype > RTNL_SLAVE_MAX_TYPE) return -EINVAL; if (m_ops->slave_maxtype && linkinfo[IFLA_INFO_SLAVE_DATA]) { err = nla_parse_nested_deprecated(slave_attr, m_ops->slave_maxtype, linkinfo[IFLA_INFO_SLAVE_DATA], m_ops->slave_policy, extack); if (err < 0) return err; slave_data = slave_attr; } } if (dev) { int status = 0; if (nlh->nlmsg_flags & NLM_F_EXCL) return -EEXIST; if (nlh->nlmsg_flags & NLM_F_REPLACE) return -EOPNOTSUPP; if (linkinfo[IFLA_INFO_DATA]) { if (!ops || ops != dev->rtnl_link_ops || !ops->changelink) return -EOPNOTSUPP; err = ops->changelink(dev, tb, data, extack); if (err < 0) return err; status |= DO_SETLINK_NOTIFY; } if (linkinfo[IFLA_INFO_SLAVE_DATA]) { if (!m_ops || !m_ops->slave_changelink) return -EOPNOTSUPP; err = m_ops->slave_changelink(master_dev, dev, tb, slave_data, extack); if (err < 0) return err; status |= DO_SETLINK_NOTIFY; } return do_setlink(skb, dev, ifm, extack, tb, status); } if (!(nlh->nlmsg_flags & NLM_F_CREATE)) { /* No dev found and NLM_F_CREATE not set. 
Requested dev does not exist, * or it's for a group */ if (link_specified) return -ENODEV; if (tb[IFLA_GROUP]) return rtnl_group_changelink(skb, net, nla_get_u32(tb[IFLA_GROUP]), ifm, extack, tb); return -ENODEV; } if (tb[IFLA_MAP] || tb[IFLA_PROTINFO]) return -EOPNOTSUPP; if (!ops) { #ifdef CONFIG_MODULES if (kind[0]) { __rtnl_unlock(); request_module("rtnl-link-%s", kind); rtnl_lock(); ops = rtnl_link_ops_get(kind); if (ops) goto replay; } #endif NL_SET_ERR_MSG(extack, "Unknown device type"); return -EOPNOTSUPP; } if (!ops->alloc && !ops->setup) return -EOPNOTSUPP; if (tb[IFLA_IFNAME]) { nla_strscpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); } else { snprintf(ifname, IFNAMSIZ, "%s%%d", ops->kind); name_assign_type = NET_NAME_ENUM; } dest_net = rtnl_link_get_net_capable(skb, net, tb, CAP_NET_ADMIN); if (IS_ERR(dest_net)) return PTR_ERR(dest_net); if (tb[IFLA_LINK_NETNSID]) { int id = nla_get_s32(tb[IFLA_LINK_NETNSID]); link_net = get_net_ns_by_id(dest_net, id); if (!link_net) { NL_SET_ERR_MSG(extack, "Unknown network namespace id"); err = -EINVAL; goto out; } err = -EPERM; if (!netlink_ns_capable(skb, link_net->user_ns, CAP_NET_ADMIN)) goto out; } else { link_net = NULL; } dev = rtnl_create_link(link_net ? : dest_net, ifname, name_assign_type, ops, tb, extack); if (IS_ERR(dev)) { err = PTR_ERR(dev); goto out; } dev->ifindex = ifm->ifi_index; if (ops->newlink) err = ops->newlink(link_net ? : net, dev, tb, data, extack); else err = register_netdevice(dev); if (err < 0) { free_netdev(dev); goto out; } err = rtnl_configure_link(dev, ifm); if (err < 0) goto out_unregister; if (link_net) { err = dev_change_net_namespace(dev, dest_net, ifname); if (err < 0) goto out_unregister; } if (tb[IFLA_MASTER]) { err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]), extack); if (err) goto out_unregister; } out: if (link_net) put_net(link_net); put_net(dest_net); return err; out_unregister: if (ops->newlink) { LIST_HEAD(list_kill); ops->dellink(dev, &list_kill); unregister_netdevice_many(&list_kill); } else { unregister_netdevice(dev); } goto out; } static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct nlattr **attr; int ret; attr = kmalloc_array(RTNL_MAX_TYPE + 1, sizeof(*attr), GFP_KERNEL); if (!attr) return -ENOMEM; ret = __rtnl_newlink(skb, nlh, attr, extack); kfree(attr); return ret; } static int rtnl_valid_getlink_req(struct sk_buff *skb, const struct nlmsghdr *nlh, struct nlattr **tb, struct netlink_ext_ack *extack) { struct ifinfomsg *ifm; int i, err; if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) { NL_SET_ERR_MSG(extack, "Invalid header for get link"); return -EINVAL; } if (!netlink_strict_get_check(skb)) return nlmsg_parse_deprecated(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, extack); ifm = nlmsg_data(nlh); if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags || ifm->ifi_change) { NL_SET_ERR_MSG(extack, "Invalid values in header for get link request"); return -EINVAL; } err = nlmsg_parse_deprecated_strict(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, extack); if (err) return err; for (i = 0; i <= IFLA_MAX; i++) { if (!tb[i]) continue; switch (i) { case IFLA_IFNAME: case IFLA_ALT_IFNAME: case IFLA_EXT_MASK: case IFLA_TARGET_NETNSID: break; default: NL_SET_ERR_MSG(extack, "Unsupported attribute in get link request"); return -EINVAL; } } return 0; } static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct net *tgt_net = net; struct ifinfomsg *ifm; struct 
nlattr *tb[IFLA_MAX+1]; struct net_device *dev = NULL; struct sk_buff *nskb; int netnsid = -1; int err; u32 ext_filter_mask = 0; err = rtnl_valid_getlink_req(skb, nlh, tb, extack); if (err < 0) return err; err = rtnl_ensure_unique_netns(tb, extack, true); if (err < 0) return err; if (tb[IFLA_TARGET_NETNSID]) { netnsid = nla_get_s32(tb[IFLA_TARGET_NETNSID]); tgt_net = rtnl_get_net_ns_capable(NETLINK_CB(skb).sk, netnsid); if (IS_ERR(tgt_net)) return PTR_ERR(tgt_net); } if (tb[IFLA_EXT_MASK]) ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]); err = -EINVAL; ifm = nlmsg_data(nlh); if (ifm->ifi_index > 0) dev = __dev_get_by_index(tgt_net, ifm->ifi_index); else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) dev = rtnl_dev_get(tgt_net, tb); else goto out; err = -ENODEV; if (dev == NULL) goto out; err = -ENOBUFS; nskb = nlmsg_new(if_nlmsg_size(dev, ext_filter_mask), GFP_KERNEL); if (nskb == NULL) goto out; err = rtnl_fill_ifinfo(nskb, dev, net, RTM_NEWLINK, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, 0, ext_filter_mask, 0, NULL, 0, netnsid, GFP_KERNEL); if (err < 0) { /* -EMSGSIZE implies BUG in if_nlmsg_size */ WARN_ON(err == -EMSGSIZE); kfree_skb(nskb); } else err = rtnl_unicast(nskb, net, NETLINK_CB(skb).portid); out: if (netnsid >= 0) put_net(tgt_net); return err; } static int rtnl_alt_ifname(int cmd, struct net_device *dev, struct nlattr *attr, bool *changed, struct netlink_ext_ack *extack) { char *alt_ifname; size_t size; int err; err = nla_validate(attr, attr->nla_len, IFLA_MAX, ifla_policy, extack); if (err) return err; if (cmd == RTM_NEWLINKPROP) { size = rtnl_prop_list_size(dev); size += nla_total_size(ALTIFNAMSIZ); if (size >= U16_MAX) { NL_SET_ERR_MSG(extack, "effective property list too long"); return -EINVAL; } } alt_ifname = nla_strdup(attr, GFP_KERNEL_ACCOUNT); if (!alt_ifname) return -ENOMEM; if (cmd == RTM_NEWLINKPROP) { err = netdev_name_node_alt_create(dev, alt_ifname); if (!err) alt_ifname = NULL; } else if (cmd == RTM_DELLINKPROP) { err = netdev_name_node_alt_destroy(dev, alt_ifname); } else { WARN_ON_ONCE(1); err = -EINVAL; } kfree(alt_ifname); if (!err) *changed = true; return err; } static int rtnl_linkprop(int cmd, struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct nlattr *tb[IFLA_MAX + 1]; struct net_device *dev; struct ifinfomsg *ifm; bool changed = false; struct nlattr *attr; int err, rem; err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, extack); if (err) return err; err = rtnl_ensure_unique_netns(tb, extack, true); if (err) return err; ifm = nlmsg_data(nlh); if (ifm->ifi_index > 0) dev = __dev_get_by_index(net, ifm->ifi_index); else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) dev = rtnl_dev_get(net, tb); else return -EINVAL; if (!dev) return -ENODEV; if (!tb[IFLA_PROP_LIST]) return 0; nla_for_each_nested(attr, tb[IFLA_PROP_LIST], rem) { switch (nla_type(attr)) { case IFLA_ALT_IFNAME: err = rtnl_alt_ifname(cmd, dev, attr, &changed, extack); if (err) return err; break; } } if (changed) netdev_state_change(dev); return 0; } static int rtnl_newlinkprop(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { return rtnl_linkprop(RTM_NEWLINKPROP, skb, nlh, extack); } static int rtnl_dellinkprop(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { return rtnl_linkprop(RTM_DELLINKPROP, skb, nlh, extack); } static u32 rtnl_calcit(struct sk_buff *skb, struct nlmsghdr *nlh) { struct net *net = sock_net(skb->sk); size_t min_ifinfo_dump_size = 0; struct 
nlattr *tb[IFLA_MAX+1]; u32 ext_filter_mask = 0; struct net_device *dev; int hdrlen; /* Same kernel<->userspace interface hack as in rtnl_dump_ifinfo. */ hdrlen = nlmsg_len(nlh) < sizeof(struct ifinfomsg) ? sizeof(struct rtgenmsg) : sizeof(struct ifinfomsg); if (nlmsg_parse_deprecated(nlh, hdrlen, tb, IFLA_MAX, ifla_policy, NULL) >= 0) { if (tb[IFLA_EXT_MASK]) ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]); } if (!ext_filter_mask) return NLMSG_GOODSIZE; /* * traverse the list of net devices and compute the minimum * buffer size based upon the filter mask. */ rcu_read_lock(); for_each_netdev_rcu(net, dev) { min_ifinfo_dump_size = max(min_ifinfo_dump_size, if_nlmsg_size(dev, ext_filter_mask)); } rcu_read_unlock(); return nlmsg_total_size(min_ifinfo_dump_size); } static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb) { int idx; int s_idx = cb->family; int type = cb->nlh->nlmsg_type - RTM_BASE; int ret = 0; if (s_idx == 0) s_idx = 1; for (idx = 1; idx <= RTNL_FAMILY_MAX; idx++) { struct rtnl_link __rcu **tab; struct rtnl_link *link; rtnl_dumpit_func dumpit; if (idx < s_idx || idx == PF_PACKET) continue; if (type < 0 || type >= RTM_NR_MSGTYPES) continue; tab = rcu_dereference_rtnl(rtnl_msg_handlers[idx]); if (!tab) continue; link = rcu_dereference_rtnl(tab[type]); if (!link) continue; dumpit = link->dumpit; if (!dumpit) continue; if (idx > s_idx) { memset(&cb->args[0], 0, sizeof(cb->args)); cb->prev_seq = 0; cb->seq = 0; } ret = dumpit(skb, cb); if (ret) break; } cb->family = idx; return skb->len ? : ret; } struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, unsigned int change, u32 event, gfp_t flags, int *new_nsid, int new_ifindex) { struct net *net = dev_net(dev); struct sk_buff *skb; int err = -ENOBUFS; size_t if_info_size; skb = nlmsg_new((if_info_size = if_nlmsg_size(dev, 0)), flags); if (skb == NULL) goto errout; err = rtnl_fill_ifinfo(skb, dev, dev_net(dev), type, 0, 0, change, 0, 0, event, new_nsid, new_ifindex, -1, flags); if (err < 0) { /* -EMSGSIZE implies BUG in if_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } return skb; errout: if (err < 0) rtnl_set_sk_err(net, RTNLGRP_LINK, err); return NULL; } void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev, gfp_t flags) { struct net *net = dev_net(dev); rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, flags); } static void rtmsg_ifinfo_event(int type, struct net_device *dev, unsigned int change, u32 event, gfp_t flags, int *new_nsid, int new_ifindex) { struct sk_buff *skb; if (dev->reg_state != NETREG_REGISTERED) return; skb = rtmsg_ifinfo_build_skb(type, dev, change, event, flags, new_nsid, new_ifindex); if (skb) rtmsg_ifinfo_send(skb, dev, flags); } void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change, gfp_t flags) { rtmsg_ifinfo_event(type, dev, change, rtnl_get_event(0), flags, NULL, 0); } void rtmsg_ifinfo_newnet(int type, struct net_device *dev, unsigned int change, gfp_t flags, int *new_nsid, int new_ifindex) { rtmsg_ifinfo_event(type, dev, change, rtnl_get_event(0), flags, new_nsid, new_ifindex); } static int nlmsg_populate_fdb_fill(struct sk_buff *skb, struct net_device *dev, u8 *addr, u16 vid, u32 pid, u32 seq, int type, unsigned int flags, int nlflags, u16 ndm_state) { struct nlmsghdr *nlh; struct ndmsg *ndm; nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ndm), nlflags); if (!nlh) return -EMSGSIZE; ndm = nlmsg_data(nlh); ndm->ndm_family = AF_BRIDGE; ndm->ndm_pad1 = 0; ndm->ndm_pad2 = 0; ndm->ndm_flags = flags; ndm->ndm_type = 0; 
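	/* What follows identifies the entry for userspace: the port's
	 * ifindex, the FDB state (e.g. NUD_PERMANENT), the MAC address as
	 * NDA_LLADDR and, when non-zero, the VLAN id as NDA_VLAN.
	 */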
ndm->ndm_ifindex = dev->ifindex; ndm->ndm_state = ndm_state; if (nla_put(skb, NDA_LLADDR, dev->addr_len, addr)) goto nla_put_failure; if (vid) if (nla_put(skb, NDA_VLAN, sizeof(u16), &vid)) goto nla_put_failure; nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static inline size_t rtnl_fdb_nlmsg_size(const struct net_device *dev) { return NLMSG_ALIGN(sizeof(struct ndmsg)) + nla_total_size(dev->addr_len) + /* NDA_LLADDR */ nla_total_size(sizeof(u16)) + /* NDA_VLAN */ 0; } static void rtnl_fdb_notify(struct net_device *dev, u8 *addr, u16 vid, int type, u16 ndm_state) { struct net *net = dev_net(dev); struct sk_buff *skb; int err = -ENOBUFS; skb = nlmsg_new(rtnl_fdb_nlmsg_size(dev), GFP_ATOMIC); if (!skb) goto errout; err = nlmsg_populate_fdb_fill(skb, dev, addr, vid, 0, 0, type, NTF_SELF, 0, ndm_state); if (err < 0) { kfree_skb(skb); goto errout; } rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC); return; errout: rtnl_set_sk_err(net, RTNLGRP_NEIGH, err); } /* * ndo_dflt_fdb_add - default netdevice operation to add an FDB entry */ int ndo_dflt_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, u16 flags) { int err = -EINVAL; /* If aging addresses are supported device will need to * implement its own handler for this. */ if (ndm->ndm_state && !(ndm->ndm_state & NUD_PERMANENT)) { netdev_info(dev, "default FDB implementation only supports local addresses\n"); return err; } if (vid) { netdev_info(dev, "vlans aren't supported yet for dev_uc|mc_add()\n"); return err; } if (is_unicast_ether_addr(addr) || is_link_local_ether_addr(addr)) err = dev_uc_add_excl(dev, addr); else if (is_multicast_ether_addr(addr)) err = dev_mc_add_excl(dev, addr); /* Only return duplicate errors if NLM_F_EXCL is set */ if (err == -EEXIST && !(flags & NLM_F_EXCL)) err = 0; return err; } EXPORT_SYMBOL(ndo_dflt_fdb_add); static int fdb_vid_parse(struct nlattr *vlan_attr, u16 *p_vid, struct netlink_ext_ack *extack) { u16 vid = 0; if (vlan_attr) { if (nla_len(vlan_attr) != sizeof(u16)) { NL_SET_ERR_MSG(extack, "invalid vlan attribute size"); return -EINVAL; } vid = nla_get_u16(vlan_attr); if (!vid || vid >= VLAN_VID_MASK) { NL_SET_ERR_MSG(extack, "invalid vlan id"); return -EINVAL; } } *p_vid = vid; return 0; } static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct ndmsg *ndm; struct nlattr *tb[NDA_MAX+1]; struct net_device *dev; u8 *addr; u16 vid; int err; err = nlmsg_parse_deprecated(nlh, sizeof(*ndm), tb, NDA_MAX, NULL, extack); if (err < 0) return err; ndm = nlmsg_data(nlh); if (ndm->ndm_ifindex == 0) { NL_SET_ERR_MSG(extack, "invalid ifindex"); return -EINVAL; } dev = __dev_get_by_index(net, ndm->ndm_ifindex); if (dev == NULL) { NL_SET_ERR_MSG(extack, "unknown ifindex"); return -ENODEV; } if (!tb[NDA_LLADDR] || nla_len(tb[NDA_LLADDR]) != ETH_ALEN) { NL_SET_ERR_MSG(extack, "invalid address"); return -EINVAL; } if (dev->type != ARPHRD_ETHER) { NL_SET_ERR_MSG(extack, "FDB add only supported for Ethernet devices"); return -EINVAL; } addr = nla_data(tb[NDA_LLADDR]); err = fdb_vid_parse(tb[NDA_VLAN], &vid, extack); if (err) return err; err = -EOPNOTSUPP; /* Support fdb on master device the net/bridge default case */ if ((!ndm->ndm_flags || ndm->ndm_flags & NTF_MASTER) && netif_is_bridge_port(dev)) { struct net_device *br_dev = netdev_master_upper_dev_get(dev); const struct net_device_ops *ops = br_dev->netdev_ops; err = 
ops->ndo_fdb_add(ndm, tb, dev, addr, vid, nlh->nlmsg_flags, extack); if (err) goto out; else ndm->ndm_flags &= ~NTF_MASTER; } /* Embedded bridge, macvlan, and any other device support */ if ((ndm->ndm_flags & NTF_SELF)) { if (dev->netdev_ops->ndo_fdb_add) err = dev->netdev_ops->ndo_fdb_add(ndm, tb, dev, addr, vid, nlh->nlmsg_flags, extack); else err = ndo_dflt_fdb_add(ndm, tb, dev, addr, vid, nlh->nlmsg_flags); if (!err) { rtnl_fdb_notify(dev, addr, vid, RTM_NEWNEIGH, ndm->ndm_state); ndm->ndm_flags &= ~NTF_SELF; } } out: return err; } /* * ndo_dflt_fdb_del - default netdevice operation to delete an FDB entry */ int ndo_dflt_fdb_del(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid) { int err = -EINVAL; /* If aging addresses are supported device will need to * implement its own handler for this. */ if (!(ndm->ndm_state & NUD_PERMANENT)) { netdev_info(dev, "default FDB implementation only supports local addresses\n"); return err; } if (is_unicast_ether_addr(addr) || is_link_local_ether_addr(addr)) err = dev_uc_del(dev, addr); else if (is_multicast_ether_addr(addr)) err = dev_mc_del(dev, addr); return err; } EXPORT_SYMBOL(ndo_dflt_fdb_del); static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct ndmsg *ndm; struct nlattr *tb[NDA_MAX+1]; struct net_device *dev; __u8 *addr; int err; u16 vid; if (!netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; err = nlmsg_parse_deprecated(nlh, sizeof(*ndm), tb, NDA_MAX, NULL, extack); if (err < 0) return err; ndm = nlmsg_data(nlh); if (ndm->ndm_ifindex == 0) { NL_SET_ERR_MSG(extack, "invalid ifindex"); return -EINVAL; } dev = __dev_get_by_index(net, ndm->ndm_ifindex); if (dev == NULL) { NL_SET_ERR_MSG(extack, "unknown ifindex"); return -ENODEV; } if (!tb[NDA_LLADDR] || nla_len(tb[NDA_LLADDR]) != ETH_ALEN) { NL_SET_ERR_MSG(extack, "invalid address"); return -EINVAL; } if (dev->type != ARPHRD_ETHER) { NL_SET_ERR_MSG(extack, "FDB delete only supported for Ethernet devices"); return -EINVAL; } addr = nla_data(tb[NDA_LLADDR]); err = fdb_vid_parse(tb[NDA_VLAN], &vid, extack); if (err) return err; err = -EOPNOTSUPP; /* Support fdb on master device the net/bridge default case */ if ((!ndm->ndm_flags || ndm->ndm_flags & NTF_MASTER) && netif_is_bridge_port(dev)) { struct net_device *br_dev = netdev_master_upper_dev_get(dev); const struct net_device_ops *ops = br_dev->netdev_ops; if (ops->ndo_fdb_del) err = ops->ndo_fdb_del(ndm, tb, dev, addr, vid); if (err) goto out; else ndm->ndm_flags &= ~NTF_MASTER; } /* Embedded bridge, macvlan, and any other device support */ if (ndm->ndm_flags & NTF_SELF) { if (dev->netdev_ops->ndo_fdb_del) err = dev->netdev_ops->ndo_fdb_del(ndm, tb, dev, addr, vid); else err = ndo_dflt_fdb_del(ndm, tb, dev, addr, vid); if (!err) { rtnl_fdb_notify(dev, addr, vid, RTM_DELNEIGH, ndm->ndm_state); ndm->ndm_flags &= ~NTF_SELF; } } out: return err; } static int nlmsg_populate_fdb(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev, int *idx, struct netdev_hw_addr_list *list) { struct netdev_hw_addr *ha; int err; u32 portid, seq; portid = NETLINK_CB(cb->skb).portid; seq = cb->nlh->nlmsg_seq; list_for_each_entry(ha, &list->list, list) { if (*idx < cb->args[2]) goto skip; err = nlmsg_populate_fdb_fill(skb, dev, ha->addr, 0, portid, seq, RTM_NEWNEIGH, NTF_SELF, NLM_F_MULTI, NUD_PERMANENT); if (err < 0) return err; skip: *idx += 1; } return 0; } /** * ndo_dflt_fdb_dump - default netdevice operation 
to dump an FDB table. * @skb: socket buffer to store message in * @cb: netlink callback * @dev: netdevice * @filter_dev: ignored * @idx: the number of FDB table entries dumped is added to *@idx * * Default netdevice operation to dump the existing unicast address list. * Returns number of addresses from list put in skb. */ int ndo_dflt_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev, struct net_device *filter_dev, int *idx) { int err; if (dev->type != ARPHRD_ETHER) return -EINVAL; netif_addr_lock_bh(dev); err = nlmsg_populate_fdb(skb, cb, dev, idx, &dev->uc); if (err) goto out; err = nlmsg_populate_fdb(skb, cb, dev, idx, &dev->mc); out: netif_addr_unlock_bh(dev); return err; } EXPORT_SYMBOL(ndo_dflt_fdb_dump); static int valid_fdb_dump_strict(const struct nlmsghdr *nlh, int *br_idx, int *brport_idx, struct netlink_ext_ack *extack) { struct nlattr *tb[NDA_MAX + 1]; struct ndmsg *ndm; int err, i; if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ndm))) { NL_SET_ERR_MSG(extack, "Invalid header for fdb dump request"); return -EINVAL; } ndm = nlmsg_data(nlh); if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_state || ndm->ndm_flags || ndm->ndm_type) { NL_SET_ERR_MSG(extack, "Invalid values in header for fdb dump request"); return -EINVAL; } err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct ndmsg), tb, NDA_MAX, NULL, extack); if (err < 0) return err; *brport_idx = ndm->ndm_ifindex; for (i = 0; i <= NDA_MAX; ++i) { if (!tb[i]) continue; switch (i) { case NDA_IFINDEX: if (nla_len(tb[i]) != sizeof(u32)) { NL_SET_ERR_MSG(extack, "Invalid IFINDEX attribute in fdb dump request"); return -EINVAL; } *brport_idx = nla_get_u32(tb[NDA_IFINDEX]); break; case NDA_MASTER: if (nla_len(tb[i]) != sizeof(u32)) { NL_SET_ERR_MSG(extack, "Invalid MASTER attribute in fdb dump request"); return -EINVAL; } *br_idx = nla_get_u32(tb[NDA_MASTER]); break; default: NL_SET_ERR_MSG(extack, "Unsupported attribute in fdb dump request"); return -EINVAL; } } return 0; } static int valid_fdb_dump_legacy(const struct nlmsghdr *nlh, int *br_idx, int *brport_idx, struct netlink_ext_ack *extack) { struct nlattr *tb[IFLA_MAX+1]; int err; /* A hack to preserve kernel<->userspace interface. * Before Linux v4.12 this code accepted ndmsg since iproute2 v3.3.0. * However, ndmsg is shorter than ifinfomsg thus nlmsg_parse() bails. * So, check for ndmsg with an optional u32 attribute (not used here). * Fortunately these sizes don't conflict with the size of ifinfomsg * with an optional attribute. 
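	 * (Concretely: sizeof(struct ndmsg) is 12 and sizeof(struct ifinfomsg)
	 * is 16, so the legacy payloads of 12 or 12 + 8 = 20 bytes cannot be
	 * confused with a bare ifinfomsg (16 bytes) or an ifinfomsg carrying
	 * a u32 attribute (24 bytes).)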
*/ if (nlmsg_len(nlh) != sizeof(struct ndmsg) && (nlmsg_len(nlh) != sizeof(struct ndmsg) + nla_attr_size(sizeof(u32)))) { struct ifinfomsg *ifm; err = nlmsg_parse_deprecated(nlh, sizeof(struct ifinfomsg), tb, IFLA_MAX, ifla_policy, extack); if (err < 0) { return -EINVAL; } else if (err == 0) { if (tb[IFLA_MASTER]) *br_idx = nla_get_u32(tb[IFLA_MASTER]); } ifm = nlmsg_data(nlh); *brport_idx = ifm->ifi_index; } return 0; } static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct net_device *dev; struct net_device *br_dev = NULL; const struct net_device_ops *ops = NULL; const struct net_device_ops *cops = NULL; struct net *net = sock_net(skb->sk); struct hlist_head *head; int brport_idx = 0; int br_idx = 0; int h, s_h; int idx = 0, s_idx; int err = 0; int fidx = 0; if (cb->strict_check) err = valid_fdb_dump_strict(cb->nlh, &br_idx, &brport_idx, cb->extack); else err = valid_fdb_dump_legacy(cb->nlh, &br_idx, &brport_idx, cb->extack); if (err < 0) return err; if (br_idx) { br_dev = __dev_get_by_index(net, br_idx); if (!br_dev) return -ENODEV; ops = br_dev->netdev_ops; } s_h = cb->args[0]; s_idx = cb->args[1]; for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { idx = 0; head = &net->dev_index_head[h]; hlist_for_each_entry(dev, head, index_hlist) { if (brport_idx && (dev->ifindex != brport_idx)) continue; if (!br_idx) { /* user did not specify a specific bridge */ if (netif_is_bridge_port(dev)) { br_dev = netdev_master_upper_dev_get(dev); cops = br_dev->netdev_ops; } } else { if (dev != br_dev && !netif_is_bridge_port(dev)) continue; if (br_dev != netdev_master_upper_dev_get(dev) && !(dev->priv_flags & IFF_EBRIDGE)) continue; cops = ops; } if (idx < s_idx) goto cont; if (netif_is_bridge_port(dev)) { if (cops && cops->ndo_fdb_dump) { err = cops->ndo_fdb_dump(skb, cb, br_dev, dev, &fidx); if (err == -EMSGSIZE) goto out; } } if (dev->netdev_ops->ndo_fdb_dump) err = dev->netdev_ops->ndo_fdb_dump(skb, cb, dev, NULL, &fidx); else err = ndo_dflt_fdb_dump(skb, cb, dev, NULL, &fidx); if (err == -EMSGSIZE) goto out; cops = NULL; /* reset fdb offset to 0 for rest of the interfaces */ cb->args[2] = 0; fidx = 0; cont: idx++; } } out: cb->args[0] = h; cb->args[1] = idx; cb->args[2] = fidx; return skb->len; } static int valid_fdb_get_strict(const struct nlmsghdr *nlh, struct nlattr **tb, u8 *ndm_flags, int *br_idx, int *brport_idx, u8 **addr, u16 *vid, struct netlink_ext_ack *extack) { struct ndmsg *ndm; int err, i; if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ndm))) { NL_SET_ERR_MSG(extack, "Invalid header for fdb get request"); return -EINVAL; } ndm = nlmsg_data(nlh); if (ndm->ndm_pad1 || ndm->ndm_pad2 || ndm->ndm_state || ndm->ndm_type) { NL_SET_ERR_MSG(extack, "Invalid values in header for fdb get request"); return -EINVAL; } if (ndm->ndm_flags & ~(NTF_MASTER | NTF_SELF)) { NL_SET_ERR_MSG(extack, "Invalid flags in header for fdb get request"); return -EINVAL; } err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct ndmsg), tb, NDA_MAX, nda_policy, extack); if (err < 0) return err; *ndm_flags = ndm->ndm_flags; *brport_idx = ndm->ndm_ifindex; for (i = 0; i <= NDA_MAX; ++i) { if (!tb[i]) continue; switch (i) { case NDA_MASTER: *br_idx = nla_get_u32(tb[i]); break; case NDA_LLADDR: if (nla_len(tb[i]) != ETH_ALEN) { NL_SET_ERR_MSG(extack, "Invalid address in fdb get request"); return -EINVAL; } *addr = nla_data(tb[i]); break; case NDA_VLAN: err = fdb_vid_parse(tb[i], vid, extack); if (err) return err; break; case NDA_VNI: break; default: NL_SET_ERR_MSG(extack, "Unsupported attribute 
in fdb get request"); return -EINVAL; } } return 0; } static int rtnl_fdb_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net_device *dev = NULL, *br_dev = NULL; const struct net_device_ops *ops = NULL; struct net *net = sock_net(in_skb->sk); struct nlattr *tb[NDA_MAX + 1]; struct sk_buff *skb; int brport_idx = 0; u8 ndm_flags = 0; int br_idx = 0; u8 *addr = NULL; u16 vid = 0; int err; err = valid_fdb_get_strict(nlh, tb, &ndm_flags, &br_idx, &brport_idx, &addr, &vid, extack); if (err < 0) return err; if (!addr) { NL_SET_ERR_MSG(extack, "Missing lookup address for fdb get request"); return -EINVAL; } if (brport_idx) { dev = __dev_get_by_index(net, brport_idx); if (!dev) { NL_SET_ERR_MSG(extack, "Unknown device ifindex"); return -ENODEV; } } if (br_idx) { if (dev) { NL_SET_ERR_MSG(extack, "Master and device are mutually exclusive"); return -EINVAL; } br_dev = __dev_get_by_index(net, br_idx); if (!br_dev) { NL_SET_ERR_MSG(extack, "Invalid master ifindex"); return -EINVAL; } ops = br_dev->netdev_ops; } if (dev) { if (!ndm_flags || (ndm_flags & NTF_MASTER)) { if (!netif_is_bridge_port(dev)) { NL_SET_ERR_MSG(extack, "Device is not a bridge port"); return -EINVAL; } br_dev = netdev_master_upper_dev_get(dev); if (!br_dev) { NL_SET_ERR_MSG(extack, "Master of device not found"); return -EINVAL; } ops = br_dev->netdev_ops; } else { if (!(ndm_flags & NTF_SELF)) { NL_SET_ERR_MSG(extack, "Missing NTF_SELF"); return -EINVAL; } ops = dev->netdev_ops; } } if (!br_dev && !dev) { NL_SET_ERR_MSG(extack, "No device specified"); return -ENODEV; } if (!ops || !ops->ndo_fdb_get) { NL_SET_ERR_MSG(extack, "Fdb get operation not supported by device"); return -EOPNOTSUPP; } skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) return -ENOBUFS; if (br_dev) dev = br_dev; err = ops->ndo_fdb_get(skb, tb, dev, addr, vid, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, extack); if (err) goto out; return rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); out: kfree_skb(skb); return err; } static int brport_nla_put_flag(struct sk_buff *skb, u32 flags, u32 mask, unsigned int attrnum, unsigned int flag) { if (mask & flag) return nla_put_u8(skb, attrnum, !!(flags & flag)); return 0; } int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, struct net_device *dev, u16 mode, u32 flags, u32 mask, int nlflags, u32 filter_mask, int (*vlan_fill)(struct sk_buff *skb, struct net_device *dev, u32 filter_mask)) { struct nlmsghdr *nlh; struct ifinfomsg *ifm; struct nlattr *br_afspec; struct nlattr *protinfo; u8 operstate = netif_running(dev) ? 
dev->operstate : IF_OPER_DOWN; struct net_device *br_dev = netdev_master_upper_dev_get(dev); int err = 0; nlh = nlmsg_put(skb, pid, seq, RTM_NEWLINK, sizeof(*ifm), nlflags); if (nlh == NULL) return -EMSGSIZE; ifm = nlmsg_data(nlh); ifm->ifi_family = AF_BRIDGE; ifm->__ifi_pad = 0; ifm->ifi_type = dev->type; ifm->ifi_index = dev->ifindex; ifm->ifi_flags = dev_get_flags(dev); ifm->ifi_change = 0; if (nla_put_string(skb, IFLA_IFNAME, dev->name) || nla_put_u32(skb, IFLA_MTU, dev->mtu) || nla_put_u8(skb, IFLA_OPERSTATE, operstate) || (br_dev && nla_put_u32(skb, IFLA_MASTER, br_dev->ifindex)) || (dev->addr_len && nla_put(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr)) || (dev->ifindex != dev_get_iflink(dev) && nla_put_u32(skb, IFLA_LINK, dev_get_iflink(dev)))) goto nla_put_failure; br_afspec = nla_nest_start_noflag(skb, IFLA_AF_SPEC); if (!br_afspec) goto nla_put_failure; if (nla_put_u16(skb, IFLA_BRIDGE_FLAGS, BRIDGE_FLAGS_SELF)) { nla_nest_cancel(skb, br_afspec); goto nla_put_failure; } if (mode != BRIDGE_MODE_UNDEF) { if (nla_put_u16(skb, IFLA_BRIDGE_MODE, mode)) { nla_nest_cancel(skb, br_afspec); goto nla_put_failure; } } if (vlan_fill) { err = vlan_fill(skb, dev, filter_mask); if (err) { nla_nest_cancel(skb, br_afspec); goto nla_put_failure; } } nla_nest_end(skb, br_afspec); protinfo = nla_nest_start(skb, IFLA_PROTINFO); if (!protinfo) goto nla_put_failure; if (brport_nla_put_flag(skb, flags, mask, IFLA_BRPORT_MODE, BR_HAIRPIN_MODE) || brport_nla_put_flag(skb, flags, mask, IFLA_BRPORT_GUARD, BR_BPDU_GUARD) || brport_nla_put_flag(skb, flags, mask, IFLA_BRPORT_FAST_LEAVE, BR_MULTICAST_FAST_LEAVE) || brport_nla_put_flag(skb, flags, mask, IFLA_BRPORT_PROTECT, BR_ROOT_BLOCK) || brport_nla_put_flag(skb, flags, mask, IFLA_BRPORT_LEARNING, BR_LEARNING) || brport_nla_put_flag(skb, flags, mask, IFLA_BRPORT_LEARNING_SYNC, BR_LEARNING_SYNC) || brport_nla_put_flag(skb, flags, mask, IFLA_BRPORT_UNICAST_FLOOD, BR_FLOOD) || brport_nla_put_flag(skb, flags, mask, IFLA_BRPORT_PROXYARP, BR_PROXYARP) || brport_nla_put_flag(skb, flags, mask, IFLA_BRPORT_MCAST_FLOOD, BR_MCAST_FLOOD) || brport_nla_put_flag(skb, flags, mask, IFLA_BRPORT_BCAST_FLOOD, BR_BCAST_FLOOD)) { nla_nest_cancel(skb, protinfo); goto nla_put_failure; } nla_nest_end(skb, protinfo); nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return err ? 
err : -EMSGSIZE; } EXPORT_SYMBOL_GPL(ndo_dflt_bridge_getlink); static int valid_bridge_getlink_req(const struct nlmsghdr *nlh, bool strict_check, u32 *filter_mask, struct netlink_ext_ack *extack) { struct nlattr *tb[IFLA_MAX+1]; int err, i; if (strict_check) { struct ifinfomsg *ifm; if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifm))) { NL_SET_ERR_MSG(extack, "Invalid header for bridge link dump"); return -EINVAL; } ifm = nlmsg_data(nlh); if (ifm->__ifi_pad || ifm->ifi_type || ifm->ifi_flags || ifm->ifi_change || ifm->ifi_index) { NL_SET_ERR_MSG(extack, "Invalid values in header for bridge link dump request"); return -EINVAL; } err = nlmsg_parse_deprecated_strict(nlh, sizeof(struct ifinfomsg), tb, IFLA_MAX, ifla_policy, extack); } else { err = nlmsg_parse_deprecated(nlh, sizeof(struct ifinfomsg), tb, IFLA_MAX, ifla_policy, extack); } if (err < 0) return err; /* new attributes should only be added with strict checking */ for (i = 0; i <= IFLA_MAX; ++i) { if (!tb[i]) continue; switch (i) { case IFLA_EXT_MASK: *filter_mask = nla_get_u32(tb[i]); break; default: if (strict_check) { NL_SET_ERR_MSG(extack, "Unsupported attribute in bridge link dump request"); return -EINVAL; } } } return 0; } static int rtnl_bridge_getlink(struct sk_buff *skb, struct netlink_callback *cb) { const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); struct net_device *dev; int idx = 0; u32 portid = NETLINK_CB(cb->skb).portid; u32 seq = nlh->nlmsg_seq; u32 filter_mask = 0; int err; err = valid_bridge_getlink_req(nlh, cb->strict_check, &filter_mask, cb->extack); if (err < 0 && cb->strict_check) return err; rcu_read_lock(); for_each_netdev_rcu(net, dev) { const struct net_device_ops *ops = dev->netdev_ops; struct net_device *br_dev = netdev_master_upper_dev_get(dev); if (br_dev && br_dev->netdev_ops->ndo_bridge_getlink) { if (idx >= cb->args[0]) { err = br_dev->netdev_ops->ndo_bridge_getlink( skb, portid, seq, dev, filter_mask, NLM_F_MULTI); if (err < 0 && err != -EOPNOTSUPP) { if (likely(skb->len)) break; goto out_err; } } idx++; } if (ops->ndo_bridge_getlink) { if (idx >= cb->args[0]) { err = ops->ndo_bridge_getlink(skb, portid, seq, dev, filter_mask, NLM_F_MULTI); if (err < 0 && err != -EOPNOTSUPP) { if (likely(skb->len)) break; goto out_err; } } idx++; } } err = skb->len; out_err: rcu_read_unlock(); cb->args[0] = idx; return err; } static inline size_t bridge_nlmsg_size(void) { return NLMSG_ALIGN(sizeof(struct ifinfomsg)) + nla_total_size(IFNAMSIZ) /* IFLA_IFNAME */ + nla_total_size(MAX_ADDR_LEN) /* IFLA_ADDRESS */ + nla_total_size(sizeof(u32)) /* IFLA_MASTER */ + nla_total_size(sizeof(u32)) /* IFLA_MTU */ + nla_total_size(sizeof(u32)) /* IFLA_LINK */ + nla_total_size(sizeof(u32)) /* IFLA_OPERSTATE */ + nla_total_size(sizeof(u8)) /* IFLA_PROTINFO */ + nla_total_size(sizeof(struct nlattr)) /* IFLA_AF_SPEC */ + nla_total_size(sizeof(u16)) /* IFLA_BRIDGE_FLAGS */ + nla_total_size(sizeof(u16)); /* IFLA_BRIDGE_MODE */ } static int rtnl_bridge_notify(struct net_device *dev) { struct net *net = dev_net(dev); struct sk_buff *skb; int err = -EOPNOTSUPP; if (!dev->netdev_ops->ndo_bridge_getlink) return 0; skb = nlmsg_new(bridge_nlmsg_size(), GFP_ATOMIC); if (!skb) { err = -ENOMEM; goto errout; } err = dev->netdev_ops->ndo_bridge_getlink(skb, 0, 0, dev, 0, 0); if (err < 0) goto errout; /* Notification info is only filled for bridge ports, not the bridge * device itself. Therefore, a zero notification length is valid and * should not result in an error. 
*/ if (!skb->len) goto errout; rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_ATOMIC); return 0; errout: WARN_ON(err == -EMSGSIZE); kfree_skb(skb); if (err) rtnl_set_sk_err(net, RTNLGRP_LINK, err); return err; } static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct ifinfomsg *ifm; struct net_device *dev; struct nlattr *br_spec, *attr, *br_flags_attr = NULL; int rem, err = -EOPNOTSUPP; u16 flags = 0; if (nlmsg_len(nlh) < sizeof(*ifm)) return -EINVAL; ifm = nlmsg_data(nlh); if (ifm->ifi_family != AF_BRIDGE) return -EPFNOSUPPORT; dev = __dev_get_by_index(net, ifm->ifi_index); if (!dev) { NL_SET_ERR_MSG(extack, "unknown ifindex"); return -ENODEV; } br_spec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC); if (br_spec) { nla_for_each_nested(attr, br_spec, rem) { if (nla_type(attr) == IFLA_BRIDGE_FLAGS && !br_flags_attr) { if (nla_len(attr) < sizeof(flags)) return -EINVAL; br_flags_attr = attr; flags = nla_get_u16(attr); } if (nla_type(attr) == IFLA_BRIDGE_MODE) { if (nla_len(attr) < sizeof(u16)) return -EINVAL; } } } if (!flags || (flags & BRIDGE_FLAGS_MASTER)) { struct net_device *br_dev = netdev_master_upper_dev_get(dev); if (!br_dev || !br_dev->netdev_ops->ndo_bridge_setlink) { err = -EOPNOTSUPP; goto out; } err = br_dev->netdev_ops->ndo_bridge_setlink(dev, nlh, flags, extack); if (err) goto out; flags &= ~BRIDGE_FLAGS_MASTER; } if ((flags & BRIDGE_FLAGS_SELF)) { if (!dev->netdev_ops->ndo_bridge_setlink) err = -EOPNOTSUPP; else err = dev->netdev_ops->ndo_bridge_setlink(dev, nlh, flags, extack); if (!err) { flags &= ~BRIDGE_FLAGS_SELF; /* Generate event to notify upper layer of bridge * change */ err = rtnl_bridge_notify(dev); } } if (br_flags_attr) memcpy(nla_data(br_flags_attr), &flags, sizeof(flags)); out: return err; } static int rtnl_bridge_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct ifinfomsg *ifm; struct net_device *dev; struct nlattr *br_spec, *attr = NULL; int rem, err = -EOPNOTSUPP; u16 flags = 0; bool have_flags = false; if (nlmsg_len(nlh) < sizeof(*ifm)) return -EINVAL; ifm = nlmsg_data(nlh); if (ifm->ifi_family != AF_BRIDGE) return -EPFNOSUPPORT; dev = __dev_get_by_index(net, ifm->ifi_index); if (!dev) { NL_SET_ERR_MSG(extack, "unknown ifindex"); return -ENODEV; } br_spec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC); if (br_spec) { nla_for_each_nested(attr, br_spec, rem) { if (nla_type(attr) == IFLA_BRIDGE_FLAGS) { if (nla_len(attr) < sizeof(flags)) return -EINVAL; have_flags = true; flags = nla_get_u16(attr); break; } } } if (!flags || (flags & BRIDGE_FLAGS_MASTER)) { struct net_device *br_dev = netdev_master_upper_dev_get(dev); if (!br_dev || !br_dev->netdev_ops->ndo_bridge_dellink) { err = -EOPNOTSUPP; goto out; } err = br_dev->netdev_ops->ndo_bridge_dellink(dev, nlh, flags); if (err) goto out; flags &= ~BRIDGE_FLAGS_MASTER; } if ((flags & BRIDGE_FLAGS_SELF)) { if (!dev->netdev_ops->ndo_bridge_dellink) err = -EOPNOTSUPP; else err = dev->netdev_ops->ndo_bridge_dellink(dev, nlh, flags); if (!err) { flags &= ~BRIDGE_FLAGS_SELF; /* Generate event to notify upper layer of bridge * change */ err = rtnl_bridge_notify(dev); } } if (have_flags) memcpy(nla_data(attr), &flags, sizeof(flags)); out: return err; } static bool stats_attr_valid(unsigned int mask, int attrid, int idxattr) { return (mask & IFLA_STATS_FILTER_BIT(attrid)) && (!idxattr || idxattr == attrid); } #define 
IFLA_OFFLOAD_XSTATS_FIRST (IFLA_OFFLOAD_XSTATS_UNSPEC + 1) static int rtnl_get_offload_stats_attr_size(int attr_id) { switch (attr_id) { case IFLA_OFFLOAD_XSTATS_CPU_HIT: return sizeof(struct rtnl_link_stats64); } return 0; } static int rtnl_get_offload_stats(struct sk_buff *skb, struct net_device *dev, int *prividx) { struct nlattr *attr = NULL; int attr_id, size; void *attr_data; int err; if (!(dev->netdev_ops && dev->netdev_ops->ndo_has_offload_stats && dev->netdev_ops->ndo_get_offload_stats)) return -ENODATA; for (attr_id = IFLA_OFFLOAD_XSTATS_FIRST; attr_id <= IFLA_OFFLOAD_XSTATS_MAX; attr_id++) { if (attr_id < *prividx) continue; size = rtnl_get_offload_stats_attr_size(attr_id); if (!size) continue; if (!dev->netdev_ops->ndo_has_offload_stats(dev, attr_id)) continue; attr = nla_reserve_64bit(skb, attr_id, size, IFLA_OFFLOAD_XSTATS_UNSPEC); if (!attr) goto nla_put_failure; attr_data = nla_data(attr); memset(attr_data, 0, size); err = dev->netdev_ops->ndo_get_offload_stats(attr_id, dev, attr_data); if (err) goto get_offload_stats_failure; } if (!attr) return -ENODATA; *prividx = 0; return 0; nla_put_failure: err = -EMSGSIZE; get_offload_stats_failure: *prividx = attr_id; return err; } static int rtnl_get_offload_stats_size(const struct net_device *dev) { int nla_size = 0; int attr_id; int size; if (!(dev->netdev_ops && dev->netdev_ops->ndo_has_offload_stats && dev->netdev_ops->ndo_get_offload_stats)) return 0; for (attr_id = IFLA_OFFLOAD_XSTATS_FIRST; attr_id <= IFLA_OFFLOAD_XSTATS_MAX; attr_id++) { if (!dev->netdev_ops->ndo_has_offload_stats(dev, attr_id)) continue; size = rtnl_get_offload_stats_attr_size(attr_id); nla_size += nla_total_size_64bit(size); } if (nla_size != 0) nla_size += nla_total_size(0); return nla_size; } static int rtnl_fill_statsinfo(struct sk_buff *skb, struct net_device *dev, int type, u32 pid, u32 seq, u32 change, unsigned int flags, unsigned int filter_mask, int *idxattr, int *prividx) { struct if_stats_msg *ifsm; struct nlmsghdr *nlh; struct nlattr *attr; int s_prividx = *prividx; int err; ASSERT_RTNL(); nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ifsm), flags); if (!nlh) return -EMSGSIZE; ifsm = nlmsg_data(nlh); ifsm->family = PF_UNSPEC; ifsm->pad1 = 0; ifsm->pad2 = 0; ifsm->ifindex = dev->ifindex; ifsm->filter_mask = filter_mask; if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_64, *idxattr)) { struct rtnl_link_stats64 *sp; attr = nla_reserve_64bit(skb, IFLA_STATS_LINK_64, sizeof(struct rtnl_link_stats64), IFLA_STATS_UNSPEC); if (!attr) goto nla_put_failure; sp = nla_data(attr); dev_get_stats(dev, sp); } if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_XSTATS, *idxattr)) { const struct rtnl_link_ops *ops = dev->rtnl_link_ops; if (ops && ops->fill_linkxstats) { *idxattr = IFLA_STATS_LINK_XSTATS; attr = nla_nest_start_noflag(skb, IFLA_STATS_LINK_XSTATS); if (!attr) goto nla_put_failure; err = ops->fill_linkxstats(skb, dev, prividx, *idxattr); nla_nest_end(skb, attr); if (err) goto nla_put_failure; *idxattr = 0; } } if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_XSTATS_SLAVE, *idxattr)) { const struct rtnl_link_ops *ops = NULL; const struct net_device *master; master = netdev_master_upper_dev_get(dev); if (master) ops = master->rtnl_link_ops; if (ops && ops->fill_linkxstats) { *idxattr = IFLA_STATS_LINK_XSTATS_SLAVE; attr = nla_nest_start_noflag(skb, IFLA_STATS_LINK_XSTATS_SLAVE); if (!attr) goto nla_put_failure; err = ops->fill_linkxstats(skb, dev, prividx, *idxattr); nla_nest_end(skb, attr); if (err) goto nla_put_failure; *idxattr = 0; } } if 
(stats_attr_valid(filter_mask, IFLA_STATS_LINK_OFFLOAD_XSTATS, *idxattr)) { *idxattr = IFLA_STATS_LINK_OFFLOAD_XSTATS; attr = nla_nest_start_noflag(skb, IFLA_STATS_LINK_OFFLOAD_XSTATS); if (!attr) goto nla_put_failure; err = rtnl_get_offload_stats(skb, dev, prividx); if (err == -ENODATA) nla_nest_cancel(skb, attr); else nla_nest_end(skb, attr); if (err && err != -ENODATA) goto nla_put_failure; *idxattr = 0; } if (stats_attr_valid(filter_mask, IFLA_STATS_AF_SPEC, *idxattr)) { struct rtnl_af_ops *af_ops; *idxattr = IFLA_STATS_AF_SPEC; attr = nla_nest_start_noflag(skb, IFLA_STATS_AF_SPEC); if (!attr) goto nla_put_failure; rcu_read_lock(); list_for_each_entry_rcu(af_ops, &rtnl_af_ops, list) { if (af_ops->fill_stats_af) { struct nlattr *af; int err; af = nla_nest_start_noflag(skb, af_ops->family); if (!af) { rcu_read_unlock(); goto nla_put_failure; } err = af_ops->fill_stats_af(skb, dev); if (err == -ENODATA) { nla_nest_cancel(skb, af); } else if (err < 0) { rcu_read_unlock(); goto nla_put_failure; } nla_nest_end(skb, af); } } rcu_read_unlock(); nla_nest_end(skb, attr); *idxattr = 0; } nlmsg_end(skb, nlh); return 0; nla_put_failure: /* not a multi message or no progress mean a real error */ if (!(flags & NLM_F_MULTI) || s_prividx == *prividx) nlmsg_cancel(skb, nlh); else nlmsg_end(skb, nlh); return -EMSGSIZE; } static size_t if_nlmsg_stats_size(const struct net_device *dev, u32 filter_mask) { size_t size = NLMSG_ALIGN(sizeof(struct if_stats_msg)); if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_64, 0)) size += nla_total_size_64bit(sizeof(struct rtnl_link_stats64)); if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_XSTATS, 0)) { const struct rtnl_link_ops *ops = dev->rtnl_link_ops; int attr = IFLA_STATS_LINK_XSTATS; if (ops && ops->get_linkxstats_size) { size += nla_total_size(ops->get_linkxstats_size(dev, attr)); /* for IFLA_STATS_LINK_XSTATS */ size += nla_total_size(0); } } if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_XSTATS_SLAVE, 0)) { struct net_device *_dev = (struct net_device *)dev; const struct rtnl_link_ops *ops = NULL; const struct net_device *master; /* netdev_master_upper_dev_get can't take const */ master = netdev_master_upper_dev_get(_dev); if (master) ops = master->rtnl_link_ops; if (ops && ops->get_linkxstats_size) { int attr = IFLA_STATS_LINK_XSTATS_SLAVE; size += nla_total_size(ops->get_linkxstats_size(dev, attr)); /* for IFLA_STATS_LINK_XSTATS_SLAVE */ size += nla_total_size(0); } } if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_OFFLOAD_XSTATS, 0)) size += rtnl_get_offload_stats_size(dev); if (stats_attr_valid(filter_mask, IFLA_STATS_AF_SPEC, 0)) { struct rtnl_af_ops *af_ops; /* for IFLA_STATS_AF_SPEC */ size += nla_total_size(0); rcu_read_lock(); list_for_each_entry_rcu(af_ops, &rtnl_af_ops, list) { if (af_ops->get_stats_af_size) { size += nla_total_size( af_ops->get_stats_af_size(dev)); /* for AF_* */ size += nla_total_size(0); } } rcu_read_unlock(); } return size; } static int rtnl_valid_stats_req(const struct nlmsghdr *nlh, bool strict_check, bool is_dump, struct netlink_ext_ack *extack) { struct if_stats_msg *ifsm; if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*ifsm))) { NL_SET_ERR_MSG(extack, "Invalid header for stats dump"); return -EINVAL; } if (!strict_check) return 0; ifsm = nlmsg_data(nlh); /* only requests using strict checks can pass data to influence * the dump. The legacy exception is filter_mask. 
*/ if (ifsm->pad1 || ifsm->pad2 || (is_dump && ifsm->ifindex)) { NL_SET_ERR_MSG(extack, "Invalid values in header for stats dump request"); return -EINVAL; } if (nlmsg_attrlen(nlh, sizeof(*ifsm))) { NL_SET_ERR_MSG(extack, "Invalid attributes after stats header"); return -EINVAL; } if (ifsm->filter_mask >= IFLA_STATS_FILTER_BIT(IFLA_STATS_MAX + 1)) { NL_SET_ERR_MSG(extack, "Invalid stats requested through filter mask"); return -EINVAL; } return 0; } static int rtnl_stats_get(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct net_device *dev = NULL; int idxattr = 0, prividx = 0; struct if_stats_msg *ifsm; struct sk_buff *nskb; u32 filter_mask; int err; err = rtnl_valid_stats_req(nlh, netlink_strict_get_check(skb), false, extack); if (err) return err; ifsm = nlmsg_data(nlh); if (ifsm->ifindex > 0) dev = __dev_get_by_index(net, ifsm->ifindex); else return -EINVAL; if (!dev) return -ENODEV; filter_mask = ifsm->filter_mask; if (!filter_mask) return -EINVAL; nskb = nlmsg_new(if_nlmsg_stats_size(dev, filter_mask), GFP_KERNEL); if (!nskb) return -ENOBUFS; err = rtnl_fill_statsinfo(nskb, dev, RTM_NEWSTATS, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, 0, filter_mask, &idxattr, &prividx); if (err < 0) { /* -EMSGSIZE implies BUG in if_nlmsg_stats_size */ WARN_ON(err == -EMSGSIZE); kfree_skb(nskb); } else { err = rtnl_unicast(nskb, net, NETLINK_CB(skb).portid); } return err; } static int rtnl_stats_dump(struct sk_buff *skb, struct netlink_callback *cb) { struct netlink_ext_ack *extack = cb->extack; int h, s_h, err, s_idx, s_idxattr, s_prividx; struct net *net = sock_net(skb->sk); unsigned int flags = NLM_F_MULTI; struct if_stats_msg *ifsm; struct hlist_head *head; struct net_device *dev; u32 filter_mask = 0; int idx = 0; s_h = cb->args[0]; s_idx = cb->args[1]; s_idxattr = cb->args[2]; s_prividx = cb->args[3]; cb->seq = net->dev_base_seq; err = rtnl_valid_stats_req(cb->nlh, cb->strict_check, true, extack); if (err) return err; ifsm = nlmsg_data(cb->nlh); filter_mask = ifsm->filter_mask; if (!filter_mask) { NL_SET_ERR_MSG(extack, "Filter mask must be set for stats dump"); return -EINVAL; } for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { idx = 0; head = &net->dev_index_head[h]; hlist_for_each_entry(dev, head, index_hlist) { if (idx < s_idx) goto cont; err = rtnl_fill_statsinfo(skb, dev, RTM_NEWSTATS, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, 0, flags, filter_mask, &s_idxattr, &s_prividx); /* If we ran out of room on the first message, * we're in trouble */ WARN_ON((err == -EMSGSIZE) && (skb->len == 0)); if (err < 0) goto out; s_prividx = 0; s_idxattr = 0; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); cont: idx++; } } out: cb->args[3] = s_prividx; cb->args[2] = s_idxattr; cb->args[1] = idx; cb->args[0] = h; return skb->len; } /* Process one rtnetlink message. 
*/ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct rtnl_link *link; enum rtnl_kinds kind; struct module *owner; int err = -EOPNOTSUPP; rtnl_doit_func doit; unsigned int flags; int family; int type; type = nlh->nlmsg_type; if (type > RTM_MAX) return -EOPNOTSUPP; type -= RTM_BASE; /* All the messages must have at least 1 byte length */ if (nlmsg_len(nlh) < sizeof(struct rtgenmsg)) return 0; family = ((struct rtgenmsg *)nlmsg_data(nlh))->rtgen_family; kind = type&3; if (kind != RTNL_KIND_GET && !netlink_net_capable(skb, CAP_NET_ADMIN)) return -EPERM; rcu_read_lock(); if (kind == RTNL_KIND_GET && (nlh->nlmsg_flags & NLM_F_DUMP)) { struct sock *rtnl; rtnl_dumpit_func dumpit; u32 min_dump_alloc = 0; link = rtnl_get_link(family, type); if (!link || !link->dumpit) { family = PF_UNSPEC; link = rtnl_get_link(family, type); if (!link || !link->dumpit) goto err_unlock; } owner = link->owner; dumpit = link->dumpit; if (type == RTM_GETLINK - RTM_BASE) min_dump_alloc = rtnl_calcit(skb, nlh); err = 0; /* need to do this before rcu_read_unlock() */ if (!try_module_get(owner)) err = -EPROTONOSUPPORT; rcu_read_unlock(); rtnl = net->rtnl; if (err == 0) { struct netlink_dump_control c = { .dump = dumpit, .min_dump_alloc = min_dump_alloc, .module = owner, }; err = netlink_dump_start(rtnl, skb, nlh, &c); /* netlink_dump_start() will keep a reference on * module if dump is still in progress. */ module_put(owner); } return err; } link = rtnl_get_link(family, type); if (!link || !link->doit) { family = PF_UNSPEC; link = rtnl_get_link(PF_UNSPEC, type); if (!link || !link->doit) goto out_unlock; } owner = link->owner; if (!try_module_get(owner)) { err = -EPROTONOSUPPORT; goto out_unlock; } flags = link->flags; if (flags & RTNL_FLAG_DOIT_UNLOCKED) { doit = link->doit; rcu_read_unlock(); if (doit) err = doit(skb, nlh, extack); module_put(owner); return err; } rcu_read_unlock(); rtnl_lock(); link = rtnl_get_link(family, type); if (link && link->doit) err = link->doit(skb, nlh, extack); rtnl_unlock(); module_put(owner); return err; out_unlock: rcu_read_unlock(); return err; err_unlock: rcu_read_unlock(); return -EOPNOTSUPP; } static void rtnetlink_rcv(struct sk_buff *skb) { netlink_rcv_skb(skb, &rtnetlink_rcv_msg); } static int rtnetlink_bind(struct net *net, int group) { switch (group) { case RTNLGRP_IPV4_MROUTE_R: case RTNLGRP_IPV6_MROUTE_R: if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; break; } return 0; } static int rtnetlink_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); switch (event) { case NETDEV_REBOOT: case NETDEV_CHANGEMTU: case NETDEV_CHANGEADDR: case NETDEV_CHANGENAME: case NETDEV_FEAT_CHANGE: case NETDEV_BONDING_FAILOVER: case NETDEV_POST_TYPE_CHANGE: case NETDEV_NOTIFY_PEERS: case NETDEV_CHANGEUPPER: case NETDEV_RESEND_IGMP: case NETDEV_CHANGEINFODATA: case NETDEV_CHANGELOWERSTATE: case NETDEV_CHANGE_TX_QUEUE_LEN: rtmsg_ifinfo_event(RTM_NEWLINK, dev, 0, rtnl_get_event(event), GFP_KERNEL, NULL, 0); break; default: break; } return NOTIFY_DONE; } static struct notifier_block rtnetlink_dev_notifier = { .notifier_call = rtnetlink_event, }; static int __net_init rtnetlink_net_init(struct net *net) { struct sock *sk; struct netlink_kernel_cfg cfg = { .groups = RTNLGRP_MAX, .input = rtnetlink_rcv, .cb_mutex = &rtnl_mutex, .flags = NL_CFG_F_NONROOT_RECV, .bind = rtnetlink_bind, }; sk = netlink_kernel_create(net, 
NETLINK_ROUTE, &cfg); if (!sk) return -ENOMEM; net->rtnl = sk; return 0; } static void __net_exit rtnetlink_net_exit(struct net *net) { netlink_kernel_release(net->rtnl); net->rtnl = NULL; } static struct pernet_operations rtnetlink_net_ops = { .init = rtnetlink_net_init, .exit = rtnetlink_net_exit, }; void __init rtnetlink_init(void) { if (register_pernet_subsys(&rtnetlink_net_ops)) panic("rtnetlink_init: cannot initialize rtnetlink\n"); register_netdevice_notifier(&rtnetlink_dev_notifier); rtnl_register(PF_UNSPEC, RTM_GETLINK, rtnl_getlink, rtnl_dump_ifinfo, 0); rtnl_register(PF_UNSPEC, RTM_SETLINK, rtnl_setlink, NULL, 0); rtnl_register(PF_UNSPEC, RTM_NEWLINK, rtnl_newlink, NULL, 0); rtnl_register(PF_UNSPEC, RTM_DELLINK, rtnl_dellink, NULL, 0); rtnl_register(PF_UNSPEC, RTM_GETADDR, NULL, rtnl_dump_all, 0); rtnl_register(PF_UNSPEC, RTM_GETROUTE, NULL, rtnl_dump_all, 0); rtnl_register(PF_UNSPEC, RTM_GETNETCONF, NULL, rtnl_dump_all, 0); rtnl_register(PF_UNSPEC, RTM_NEWLINKPROP, rtnl_newlinkprop, NULL, 0); rtnl_register(PF_UNSPEC, RTM_DELLINKPROP, rtnl_dellinkprop, NULL, 0); rtnl_register(PF_BRIDGE, RTM_NEWNEIGH, rtnl_fdb_add, NULL, 0); rtnl_register(PF_BRIDGE, RTM_DELNEIGH, rtnl_fdb_del, NULL, 0); rtnl_register(PF_BRIDGE, RTM_GETNEIGH, rtnl_fdb_get, rtnl_fdb_dump, 0); rtnl_register(PF_BRIDGE, RTM_GETLINK, NULL, rtnl_bridge_getlink, 0); rtnl_register(PF_BRIDGE, RTM_DELLINK, rtnl_bridge_dellink, NULL, 0); rtnl_register(PF_BRIDGE, RTM_SETLINK, rtnl_bridge_setlink, NULL, 0); rtnl_register(PF_UNSPEC, RTM_GETSTATS, rtnl_stats_get, rtnl_stats_dump, 0); }
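/* Illustrative userspace sketch (not part of the kernel source above): the
 * request that rtnl_fdb_dump() serves. "bridge fdb show" builds essentially
 * this message: RTM_GETNEIGH with ndm_family == AF_BRIDGE and NLM_F_DUMP.
 * On strict-checking kernels the reserved ndmsg fields must stay zero, which
 * is what valid_fdb_dump_strict() enforces. The function name
 * fdb_dump_request() is invented for this example; error handling is trimmed.
 */
#include <linux/netlink.h>
#include <linux/neighbour.h>
#include <linux/rtnetlink.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

static int fdb_dump_request(void)
{
	struct {
		struct nlmsghdr nlh;
		struct ndmsg ndm;
	} req;
	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);

	if (fd < 0)
		return -1;

	memset(&req, 0, sizeof(req));
	req.nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ndmsg));
	req.nlh.nlmsg_type = RTM_GETNEIGH;
	req.nlh.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
	req.nlh.nlmsg_seq = 1;
	req.ndm.ndm_family = AF_BRIDGE;	/* bridge fdb entries, not IP neighbours */

	/* The kernel walks every bridge port and calls ndo_fdb_dump() (or
	 * ndo_dflt_fdb_dump()) per device; replies arrive as an NLM_F_MULTI
	 * stream terminated by NLMSG_DONE, carrying NDA_LLADDR/NDA_VLAN/...
	 * attributes that the caller parses after recv().
	 */
	if (send(fd, &req, req.nlh.nlmsg_len, 0) < 0) {
		close(fd);
		return -1;
	}
	return fd;
}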
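/* Illustrative userspace sketch (not from the source above): the shape of an
 * RTM_GETSTATS request handled by rtnl_stats_get(). filter_mask selects which
 * IFLA_STATS_* nests the kernel fills; rtnl_valid_stats_req() rejects requests
 * with non-zero padding or with mask bits above IFLA_STATS_MAX. This assumes
 * the caller already owns a NETLINK_ROUTE socket (fd) and knows the target
 * ifindex; stats_get_request() is an invented helper name.
 */
#include <linux/if_link.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <string.h>
#include <sys/socket.h>

static int stats_get_request(int fd, int ifindex)
{
	struct {
		struct nlmsghdr nlh;
		struct if_stats_msg ifsm;
	} req;

	memset(&req, 0, sizeof(req));
	req.nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct if_stats_msg));
	req.nlh.nlmsg_type = RTM_GETSTATS;
	req.nlh.nlmsg_flags = NLM_F_REQUEST;
	req.ifsm.family = AF_UNSPEC;
	req.ifsm.ifindex = ifindex;
	/* ask only for the 64-bit link counters (IFLA_STATS_LINK_64) */
	req.ifsm.filter_mask = IFLA_STATS_FILTER_BIT(IFLA_STATS_LINK_64);

	return send(fd, &req, req.nlh.nlmsg_len, 0) < 0 ? -1 : 0;
}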
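/* Standalone demo (not kernel code) of the dispatch arithmetic used by
 * rtnetlink_rcv_msg() above: the handler table is indexed by
 * (nlmsg_type - RTM_BASE) and the low two bits give the operation kind
 * (0 NEW, 1 DEL, 2 GET, 3 SET), which is why RTM_GET* requests are allowed
 * without CAP_NET_ADMIN.
 */
#include <linux/rtnetlink.h>
#include <stdio.h>

int main(void)
{
	static const int types[] = { RTM_NEWLINK, RTM_DELLINK, RTM_GETLINK,
				     RTM_SETLINK, RTM_GETNEIGH, RTM_GETSTATS };

	for (unsigned int i = 0; i < sizeof(types) / sizeof(types[0]); i++) {
		int type = types[i];

		/* same computation as the kernel: table slot and kind bits */
		printf("type %2d -> msgindex %2d, kind %d\n",
		       type, type - RTM_BASE, type & 3);
	}
	return 0;
}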
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * IPv6 over IPv4 tunnel device - Simple Internet Transition (SIT)
 * Linux INET6 implementation
 *
 * Authors:
 * Pedro Roque <roque@di.fc.ul.pt>
 * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
 *
 * Changes:
 * Roger Venning <r.venning@telstra.com>: 6to4 support
 * Nate Thompson <nate@thebog.net>: 6to4 support
 * Fred Templin <fred.l.templin@boeing.com>: isatap support
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/in6.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/icmp.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/init.h>
#include <linux/netfilter_ipv4.h>
#include <linux/if_ether.h>
#include <net/sock.h>
#include <net/snmp.h>
#include <net/ipv6.h>
#include <net/protocol.h>
#include <net/transp_v6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <net/ip.h>
#include <net/udp.h>
#include <net/icmp.h>
#include <net/ip_tunnels.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/dsfield.h>
#include <net/net_namespace.h>
#include 
<net/netns/generic.h> /* This version of net/ipv6/sit.c is cloned of net/ipv4/ip_gre.c For comments look at net/ipv4/ip_gre.c --ANK */ #define IP6_SIT_HASH_SIZE 16 #define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF) static bool log_ecn_error = true; module_param(log_ecn_error, bool, 0644); MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN"); static int ipip6_tunnel_init(struct net_device *dev); static void ipip6_tunnel_setup(struct net_device *dev); static void ipip6_dev_free(struct net_device *dev); static bool check_6rd(struct ip_tunnel *tunnel, const struct in6_addr *v6dst, __be32 *v4dst); static struct rtnl_link_ops sit_link_ops __read_mostly; static unsigned int sit_net_id __read_mostly; struct sit_net { struct ip_tunnel __rcu *tunnels_r_l[IP6_SIT_HASH_SIZE]; struct ip_tunnel __rcu *tunnels_r[IP6_SIT_HASH_SIZE]; struct ip_tunnel __rcu *tunnels_l[IP6_SIT_HASH_SIZE]; struct ip_tunnel __rcu *tunnels_wc[1]; struct ip_tunnel __rcu **tunnels[4]; struct net_device *fb_tunnel_dev; }; static inline struct sit_net *dev_to_sit_net(struct net_device *dev) { struct ip_tunnel *t = netdev_priv(dev); return net_generic(t->net, sit_net_id); } /* * Must be invoked with rcu_read_lock */ static struct ip_tunnel *ipip6_tunnel_lookup(struct net *net, struct net_device *dev, __be32 remote, __be32 local, int sifindex) { unsigned int h0 = HASH(remote); unsigned int h1 = HASH(local); struct ip_tunnel *t; struct sit_net *sitn = net_generic(net, sit_net_id); int ifindex = dev ? dev->ifindex : 0; for_each_ip_tunnel_rcu(t, sitn->tunnels_r_l[h0 ^ h1]) { if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr && (!dev || !t->parms.link || ifindex == t->parms.link || sifindex == t->parms.link) && (t->dev->flags & IFF_UP)) return t; } for_each_ip_tunnel_rcu(t, sitn->tunnels_r[h0]) { if (remote == t->parms.iph.daddr && (!dev || !t->parms.link || ifindex == t->parms.link || sifindex == t->parms.link) && (t->dev->flags & IFF_UP)) return t; } for_each_ip_tunnel_rcu(t, sitn->tunnels_l[h1]) { if (local == t->parms.iph.saddr && (!dev || !t->parms.link || ifindex == t->parms.link || sifindex == t->parms.link) && (t->dev->flags & IFF_UP)) return t; } t = rcu_dereference(sitn->tunnels_wc[0]); if (t && (t->dev->flags & IFF_UP)) return t; return NULL; } static struct ip_tunnel __rcu **__ipip6_bucket(struct sit_net *sitn, struct ip_tunnel_parm *parms) { __be32 remote = parms->iph.daddr; __be32 local = parms->iph.saddr; unsigned int h = 0; int prio = 0; if (remote) { prio |= 2; h ^= HASH(remote); } if (local) { prio |= 1; h ^= HASH(local); } return &sitn->tunnels[prio][h]; } static inline struct ip_tunnel __rcu **ipip6_bucket(struct sit_net *sitn, struct ip_tunnel *t) { return __ipip6_bucket(sitn, &t->parms); } static void ipip6_tunnel_unlink(struct sit_net *sitn, struct ip_tunnel *t) { struct ip_tunnel __rcu **tp; struct ip_tunnel *iter; for (tp = ipip6_bucket(sitn, t); (iter = rtnl_dereference(*tp)) != NULL; tp = &iter->next) { if (t == iter) { rcu_assign_pointer(*tp, t->next); break; } } } static void ipip6_tunnel_link(struct sit_net *sitn, struct ip_tunnel *t) { struct ip_tunnel __rcu **tp = ipip6_bucket(sitn, t); rcu_assign_pointer(t->next, rtnl_dereference(*tp)); rcu_assign_pointer(*tp, t); } static void ipip6_tunnel_clone_6rd(struct net_device *dev, struct sit_net *sitn) { #ifdef CONFIG_IPV6_SIT_6RD struct ip_tunnel *t = netdev_priv(dev); if (dev == sitn->fb_tunnel_dev || !sitn->fb_tunnel_dev) { ipv6_addr_set(&t->ip6rd.prefix, htonl(0x20020000), 0, 0, 0); t->ip6rd.relay_prefix = 
0; t->ip6rd.prefixlen = 16; t->ip6rd.relay_prefixlen = 0; } else { struct ip_tunnel *t0 = netdev_priv(sitn->fb_tunnel_dev); memcpy(&t->ip6rd, &t0->ip6rd, sizeof(t->ip6rd)); } #endif } static int ipip6_tunnel_create(struct net_device *dev) { struct ip_tunnel *t = netdev_priv(dev); struct net *net = dev_net(dev); struct sit_net *sitn = net_generic(net, sit_net_id); int err; memcpy(dev->dev_addr, &t->parms.iph.saddr, 4); memcpy(dev->broadcast, &t->parms.iph.daddr, 4); if ((__force u16)t->parms.i_flags & SIT_ISATAP) dev->priv_flags |= IFF_ISATAP; dev->rtnl_link_ops = &sit_link_ops; err = register_netdevice(dev); if (err < 0) goto out; ipip6_tunnel_clone_6rd(dev, sitn); ipip6_tunnel_link(sitn, t); return 0; out: return err; } static struct ip_tunnel *ipip6_tunnel_locate(struct net *net, struct ip_tunnel_parm *parms, int create) { __be32 remote = parms->iph.daddr; __be32 local = parms->iph.saddr; struct ip_tunnel *t, *nt; struct ip_tunnel __rcu **tp; struct net_device *dev; char name[IFNAMSIZ]; struct sit_net *sitn = net_generic(net, sit_net_id); for (tp = __ipip6_bucket(sitn, parms); (t = rtnl_dereference(*tp)) != NULL; tp = &t->next) { if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr && parms->link == t->parms.link) { if (create) return NULL; else return t; } } if (!create) goto failed; if (parms->name[0]) { if (!dev_valid_name(parms->name)) goto failed; strlcpy(name, parms->name, IFNAMSIZ); } else { strcpy(name, "sit%d"); } dev = alloc_netdev(sizeof(*t), name, NET_NAME_UNKNOWN, ipip6_tunnel_setup); if (!dev) return NULL; dev_net_set(dev, net); nt = netdev_priv(dev); nt->parms = *parms; if (ipip6_tunnel_create(dev) < 0) goto failed_free; if (!parms->name[0]) strcpy(parms->name, dev->name); return nt; failed_free: free_netdev(dev); failed: return NULL; } #define for_each_prl_rcu(start) \ for (prl = rcu_dereference(start); \ prl; \ prl = rcu_dereference(prl->next)) static struct ip_tunnel_prl_entry * __ipip6_tunnel_locate_prl(struct ip_tunnel *t, __be32 addr) { struct ip_tunnel_prl_entry *prl; for_each_prl_rcu(t->prl) if (prl->addr == addr) break; return prl; } static int ipip6_tunnel_get_prl(struct net_device *dev, struct ip_tunnel_prl __user *a) { struct ip_tunnel *t = netdev_priv(dev); struct ip_tunnel_prl kprl, *kp; struct ip_tunnel_prl_entry *prl; unsigned int cmax, c = 0, ca, len; int ret = 0; if (dev == dev_to_sit_net(dev)->fb_tunnel_dev) return -EINVAL; if (copy_from_user(&kprl, a, sizeof(kprl))) return -EFAULT; cmax = kprl.datalen / sizeof(kprl); if (cmax > 1 && kprl.addr != htonl(INADDR_ANY)) cmax = 1; /* For simple GET or for root users, * we try harder to allocate. */ kp = (cmax <= 1 || capable(CAP_NET_ADMIN)) ? kcalloc(cmax, sizeof(*kp), GFP_KERNEL_ACCOUNT | __GFP_NOWARN) : NULL; ca = min(t->prl_count, cmax); if (!kp) { /* We don't try hard to allocate much memory for * non-root users. * For root users, retry allocating enough memory for * the answer. 
*/ kp = kcalloc(ca, sizeof(*kp), GFP_ATOMIC | __GFP_ACCOUNT | __GFP_NOWARN); if (!kp) { ret = -ENOMEM; goto out; } } rcu_read_lock(); for_each_prl_rcu(t->prl) { if (c >= cmax) break; if (kprl.addr != htonl(INADDR_ANY) && prl->addr != kprl.addr) continue; kp[c].addr = prl->addr; kp[c].flags = prl->flags; c++; if (kprl.addr != htonl(INADDR_ANY)) break; } rcu_read_unlock(); len = sizeof(*kp) * c; ret = 0; if ((len && copy_to_user(a + 1, kp, len)) || put_user(len, &a->datalen)) ret = -EFAULT; kfree(kp); out: return ret; } static int ipip6_tunnel_add_prl(struct ip_tunnel *t, struct ip_tunnel_prl *a, int chg) { struct ip_tunnel_prl_entry *p; int err = 0; if (a->addr == htonl(INADDR_ANY)) return -EINVAL; ASSERT_RTNL(); for (p = rtnl_dereference(t->prl); p; p = rtnl_dereference(p->next)) { if (p->addr == a->addr) { if (chg) { p->flags = a->flags; goto out; } err = -EEXIST; goto out; } } if (chg) { err = -ENXIO; goto out; } p = kzalloc(sizeof(struct ip_tunnel_prl_entry), GFP_KERNEL); if (!p) { err = -ENOBUFS; goto out; } p->next = t->prl; p->addr = a->addr; p->flags = a->flags; t->prl_count++; rcu_assign_pointer(t->prl, p); out: return err; } static void prl_list_destroy_rcu(struct rcu_head *head) { struct ip_tunnel_prl_entry *p, *n; p = container_of(head, struct ip_tunnel_prl_entry, rcu_head); do { n = rcu_dereference_protected(p->next, 1); kfree(p); p = n; } while (p); } static int ipip6_tunnel_del_prl(struct ip_tunnel *t, struct ip_tunnel_prl *a) { struct ip_tunnel_prl_entry *x; struct ip_tunnel_prl_entry __rcu **p; int err = 0; ASSERT_RTNL(); if (a && a->addr != htonl(INADDR_ANY)) { for (p = &t->prl; (x = rtnl_dereference(*p)) != NULL; p = &x->next) { if (x->addr == a->addr) { *p = x->next; kfree_rcu(x, rcu_head); t->prl_count--; goto out; } } err = -ENXIO; } else { x = rtnl_dereference(t->prl); if (x) { t->prl_count = 0; call_rcu(&x->rcu_head, prl_list_destroy_rcu); t->prl = NULL; } } out: return err; } static int ipip6_tunnel_prl_ctl(struct net_device *dev, struct ip_tunnel_prl __user *data, int cmd) { struct ip_tunnel *t = netdev_priv(dev); struct ip_tunnel_prl prl; int err; if (!ns_capable(t->net->user_ns, CAP_NET_ADMIN)) return -EPERM; if (dev == dev_to_sit_net(dev)->fb_tunnel_dev) return -EINVAL; if (copy_from_user(&prl, data, sizeof(prl))) return -EFAULT; switch (cmd) { case SIOCDELPRL: err = ipip6_tunnel_del_prl(t, &prl); break; case SIOCADDPRL: case SIOCCHGPRL: err = ipip6_tunnel_add_prl(t, &prl, cmd == SIOCCHGPRL); break; } dst_cache_reset(&t->dst_cache); netdev_state_change(dev); return err; } static int isatap_chksrc(struct sk_buff *skb, const struct iphdr *iph, struct ip_tunnel *t) { struct ip_tunnel_prl_entry *p; int ok = 1; rcu_read_lock(); p = __ipip6_tunnel_locate_prl(t, iph->saddr); if (p) { if (p->flags & PRL_DEFAULT) skb->ndisc_nodetype = NDISC_NODETYPE_DEFAULT; else skb->ndisc_nodetype = NDISC_NODETYPE_NODEFAULT; } else { const struct in6_addr *addr6 = &ipv6_hdr(skb)->saddr; if (ipv6_addr_is_isatap(addr6) && (addr6->s6_addr32[3] == iph->saddr) && ipv6_chk_prefix(addr6, t->dev)) skb->ndisc_nodetype = NDISC_NODETYPE_HOST; else ok = 0; } rcu_read_unlock(); return ok; } static void ipip6_tunnel_uninit(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); struct sit_net *sitn = net_generic(tunnel->net, sit_net_id); if (dev == sitn->fb_tunnel_dev) { RCU_INIT_POINTER(sitn->tunnels_wc[0], NULL); } else { ipip6_tunnel_unlink(sitn, tunnel); ipip6_tunnel_del_prl(tunnel, NULL); } dst_cache_reset(&tunnel->dst_cache); dev_put(dev); } static int ipip6_err(struct sk_buff 
*skb, u32 info) { const struct iphdr *iph = (const struct iphdr *)skb->data; const int type = icmp_hdr(skb)->type; const int code = icmp_hdr(skb)->code; unsigned int data_len = 0; struct ip_tunnel *t; int sifindex; int err; switch (type) { default: case ICMP_PARAMETERPROB: return 0; case ICMP_DEST_UNREACH: switch (code) { case ICMP_SR_FAILED: /* Impossible event. */ return 0; default: /* All others are translated to HOST_UNREACH. rfc2003 contains "deep thoughts" about NET_UNREACH, I believe they are just ether pollution. --ANK */ break; } break; case ICMP_TIME_EXCEEDED: if (code != ICMP_EXC_TTL) return 0; data_len = icmp_hdr(skb)->un.reserved[1] * 4; /* RFC 4884 4.1 */ break; case ICMP_REDIRECT: break; } err = -ENOENT; sifindex = netif_is_l3_master(skb->dev) ? IPCB(skb)->iif : 0; t = ipip6_tunnel_lookup(dev_net(skb->dev), skb->dev, iph->daddr, iph->saddr, sifindex); if (!t) goto out; if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { ipv4_update_pmtu(skb, dev_net(skb->dev), info, t->parms.link, iph->protocol); err = 0; goto out; } if (type == ICMP_REDIRECT) { ipv4_redirect(skb, dev_net(skb->dev), t->parms.link, iph->protocol); err = 0; goto out; } err = 0; if (__in6_dev_get(skb->dev) && !ip6_err_gen_icmpv6_unreach(skb, iph->ihl * 4, type, data_len)) goto out; if (t->parms.iph.daddr == 0) goto out; if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED) goto out; if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO)) t->err_count++; else t->err_count = 1; t->err_time = jiffies; out: return err; } static inline bool is_spoofed_6rd(struct ip_tunnel *tunnel, const __be32 v4addr, const struct in6_addr *v6addr) { __be32 v4embed = 0; if (check_6rd(tunnel, v6addr, &v4embed) && v4addr != v4embed) return true; return false; } /* Checks if an address matches an address on the tunnel interface. * Used to detect the NAT of proto 41 packets and let them pass spoofing test. * Long story: * This function is called after we considered the packet as spoofed * in is_spoofed_6rd. * We may have a router that is doing NAT for proto 41 packets * for an internal station. Destination a.a.a.a/PREFIX:bbbb:bbbb * will be translated to n.n.n.n/PREFIX:bbbb:bbbb. And is_spoofed_6rd * function will return true, dropping the packet. * But, we can still check if is spoofed against the IP * addresses associated with the interface. 
*/ static bool only_dnatted(const struct ip_tunnel *tunnel, const struct in6_addr *v6dst) { int prefix_len; #ifdef CONFIG_IPV6_SIT_6RD prefix_len = tunnel->ip6rd.prefixlen + 32 - tunnel->ip6rd.relay_prefixlen; #else prefix_len = 48; #endif return ipv6_chk_custom_prefix(v6dst, prefix_len, tunnel->dev); } /* Returns true if a packet is spoofed */ static bool packet_is_spoofed(struct sk_buff *skb, const struct iphdr *iph, struct ip_tunnel *tunnel) { const struct ipv6hdr *ipv6h; if (tunnel->dev->priv_flags & IFF_ISATAP) { if (!isatap_chksrc(skb, iph, tunnel)) return true; return false; } if (tunnel->dev->flags & IFF_POINTOPOINT) return false; ipv6h = ipv6_hdr(skb); if (unlikely(is_spoofed_6rd(tunnel, iph->saddr, &ipv6h->saddr))) { net_warn_ratelimited("Src spoofed %pI4/%pI6c -> %pI4/%pI6c\n", &iph->saddr, &ipv6h->saddr, &iph->daddr, &ipv6h->daddr); return true; } if (likely(!is_spoofed_6rd(tunnel, iph->daddr, &ipv6h->daddr))) return false; if (only_dnatted(tunnel, &ipv6h->daddr)) return false; net_warn_ratelimited("Dst spoofed %pI4/%pI6c -> %pI4/%pI6c\n", &iph->saddr, &ipv6h->saddr, &iph->daddr, &ipv6h->daddr); return true; } static int ipip6_rcv(struct sk_buff *skb) { const struct iphdr *iph = ip_hdr(skb); struct ip_tunnel *tunnel; int sifindex; int err; sifindex = netif_is_l3_master(skb->dev) ? IPCB(skb)->iif : 0; tunnel = ipip6_tunnel_lookup(dev_net(skb->dev), skb->dev, iph->saddr, iph->daddr, sifindex); if (tunnel) { struct pcpu_sw_netstats *tstats; if (tunnel->parms.iph.protocol != IPPROTO_IPV6 && tunnel->parms.iph.protocol != 0) goto out; skb->mac_header = skb->network_header; skb_reset_network_header(skb); IPCB(skb)->flags = 0; skb->dev = tunnel->dev; if (packet_is_spoofed(skb, iph, tunnel)) { DEV_STATS_INC(tunnel->dev, rx_errors); goto out; } if (iptunnel_pull_header(skb, 0, htons(ETH_P_IPV6), !net_eq(tunnel->net, dev_net(tunnel->dev)))) goto out; /* skb can be uncloned in iptunnel_pull_header, so * old iph is no longer valid */ iph = (const struct iphdr *)skb_mac_header(skb); skb_reset_mac_header(skb); err = IP_ECN_decapsulate(iph, skb); if (unlikely(err)) { if (log_ecn_error) net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n", &iph->saddr, iph->tos); if (err > 1) { DEV_STATS_INC(tunnel->dev, rx_frame_errors); DEV_STATS_INC(tunnel->dev, rx_errors); goto out; } } tstats = this_cpu_ptr(tunnel->dev->tstats); u64_stats_update_begin(&tstats->syncp); tstats->rx_packets++; tstats->rx_bytes += skb->len; u64_stats_update_end(&tstats->syncp); netif_rx(skb); return 0; } /* no tunnel matched, let upstream know, ipsec may handle it */ return 1; out: kfree_skb(skb); return 0; } static const struct tnl_ptk_info ipip_tpi = { /* no tunnel info required for ipip. */ .proto = htons(ETH_P_IP), }; #if IS_ENABLED(CONFIG_MPLS) static const struct tnl_ptk_info mplsip_tpi = { /* no tunnel info required for mplsip. */ .proto = htons(ETH_P_MPLS_UC), }; #endif static int sit_tunnel_rcv(struct sk_buff *skb, u8 ipproto) { const struct iphdr *iph; struct ip_tunnel *tunnel; int sifindex; sifindex = netif_is_l3_master(skb->dev) ? 
IPCB(skb)->iif : 0; iph = ip_hdr(skb); tunnel = ipip6_tunnel_lookup(dev_net(skb->dev), skb->dev, iph->saddr, iph->daddr, sifindex); if (tunnel) { const struct tnl_ptk_info *tpi; if (tunnel->parms.iph.protocol != ipproto && tunnel->parms.iph.protocol != 0) goto drop; if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) goto drop; #if IS_ENABLED(CONFIG_MPLS) if (ipproto == IPPROTO_MPLS) tpi = &mplsip_tpi; else #endif tpi = &ipip_tpi; if (iptunnel_pull_header(skb, 0, tpi->proto, false)) goto drop; skb_reset_mac_header(skb); return ip_tunnel_rcv(tunnel, skb, tpi, NULL, log_ecn_error); } return 1; drop: kfree_skb(skb); return 0; } static int ipip_rcv(struct sk_buff *skb) { return sit_tunnel_rcv(skb, IPPROTO_IPIP); } #if IS_ENABLED(CONFIG_MPLS) static int mplsip_rcv(struct sk_buff *skb) { return sit_tunnel_rcv(skb, IPPROTO_MPLS); } #endif /* * If the IPv6 address comes from 6rd / 6to4 (RFC 3056) addr space this function * stores the embedded IPv4 address in v4dst and returns true. */ static bool check_6rd(struct ip_tunnel *tunnel, const struct in6_addr *v6dst, __be32 *v4dst) { #ifdef CONFIG_IPV6_SIT_6RD if (ipv6_prefix_equal(v6dst, &tunnel->ip6rd.prefix, tunnel->ip6rd.prefixlen)) { unsigned int pbw0, pbi0; int pbi1; u32 d; pbw0 = tunnel->ip6rd.prefixlen >> 5; pbi0 = tunnel->ip6rd.prefixlen & 0x1f; d = tunnel->ip6rd.relay_prefixlen < 32 ? (ntohl(v6dst->s6_addr32[pbw0]) << pbi0) >> tunnel->ip6rd.relay_prefixlen : 0; pbi1 = pbi0 - tunnel->ip6rd.relay_prefixlen; if (pbi1 > 0) d |= ntohl(v6dst->s6_addr32[pbw0 + 1]) >> (32 - pbi1); *v4dst = tunnel->ip6rd.relay_prefix | htonl(d); return true; } #else if (v6dst->s6_addr16[0] == htons(0x2002)) { /* 6to4 v6 addr has 16 bits prefix, 32 v4addr, 16 SLA, ... */ memcpy(v4dst, &v6dst->s6_addr16[1], 4); return true; } #endif return false; } static inline __be32 try_6rd(struct ip_tunnel *tunnel, const struct in6_addr *v6dst) { __be32 dst = 0; check_6rd(tunnel, v6dst, &dst); return dst; } /* * This function assumes it is being called from dev_queue_xmit() * and that skb is filled properly by that function. 
*/ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); const struct iphdr *tiph = &tunnel->parms.iph; const struct ipv6hdr *iph6 = ipv6_hdr(skb); u8 tos = tunnel->parms.iph.tos; __be16 df = tiph->frag_off; struct rtable *rt; /* Route to the other host */ struct net_device *tdev; /* Device to other host */ unsigned int max_headroom; /* The extra header space needed */ __be32 dst = tiph->daddr; struct flowi4 fl4; int mtu; const struct in6_addr *addr6; int addr_type; u8 ttl; u8 protocol = IPPROTO_IPV6; int t_hlen = tunnel->hlen + sizeof(struct iphdr); if (tos == 1) tos = ipv6_get_dsfield(iph6); /* ISATAP (RFC4214) - must come before 6to4 */ if (dev->priv_flags & IFF_ISATAP) { struct neighbour *neigh = NULL; bool do_tx_error = false; if (skb_dst(skb)) neigh = dst_neigh_lookup(skb_dst(skb), &iph6->daddr); if (!neigh) { net_dbg_ratelimited("nexthop == NULL\n"); goto tx_error; } addr6 = (const struct in6_addr *)&neigh->primary_key; addr_type = ipv6_addr_type(addr6); if ((addr_type & IPV6_ADDR_UNICAST) && ipv6_addr_is_isatap(addr6)) dst = addr6->s6_addr32[3]; else do_tx_error = true; neigh_release(neigh); if (do_tx_error) goto tx_error; } if (!dst) dst = try_6rd(tunnel, &iph6->daddr); if (!dst) { struct neighbour *neigh = NULL; bool do_tx_error = false; if (skb_dst(skb)) neigh = dst_neigh_lookup(skb_dst(skb), &iph6->daddr); if (!neigh) { net_dbg_ratelimited("nexthop == NULL\n"); goto tx_error; } addr6 = (const struct in6_addr *)&neigh->primary_key; addr_type = ipv6_addr_type(addr6); if (addr_type == IPV6_ADDR_ANY) { addr6 = &ipv6_hdr(skb)->daddr; addr_type = ipv6_addr_type(addr6); } if ((addr_type & IPV6_ADDR_COMPATv4) != 0) dst = addr6->s6_addr32[3]; else do_tx_error = true; neigh_release(neigh); if (do_tx_error) goto tx_error; } flowi4_init_output(&fl4, tunnel->parms.link, tunnel->fwmark, RT_TOS(tos), RT_SCOPE_UNIVERSE, IPPROTO_IPV6, 0, dst, tiph->saddr, 0, 0, sock_net_uid(tunnel->net, NULL)); rt = dst_cache_get_ip4(&tunnel->dst_cache, &fl4.saddr); if (!rt) { rt = ip_route_output_flow(tunnel->net, &fl4, NULL); if (IS_ERR(rt)) { DEV_STATS_INC(dev, tx_carrier_errors); goto tx_error_icmp; } dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst, fl4.saddr); } if (rt->rt_type != RTN_UNICAST) { ip_rt_put(rt); DEV_STATS_INC(dev, tx_carrier_errors); goto tx_error_icmp; } tdev = rt->dst.dev; if (tdev == dev) { ip_rt_put(rt); DEV_STATS_INC(dev, collisions); goto tx_error; } if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP4)) { ip_rt_put(rt); goto tx_error; } if (df) { mtu = dst_mtu(&rt->dst) - t_hlen; if (mtu < IPV4_MIN_MTU) { DEV_STATS_INC(dev, collisions); ip_rt_put(rt); goto tx_error; } if (mtu < IPV6_MIN_MTU) { mtu = IPV6_MIN_MTU; df = 0; } if (tunnel->parms.iph.daddr) skb_dst_update_pmtu_no_confirm(skb, mtu); if (skb->len > mtu && !skb_is_gso(skb)) { icmpv6_ndo_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); ip_rt_put(rt); goto tx_error; } } if (tunnel->err_count > 0) { if (time_before(jiffies, tunnel->err_time + IPTUNNEL_ERR_TIMEO)) { tunnel->err_count--; dst_link_failure(skb); } else tunnel->err_count = 0; } /* * Okay, now see if we can stuff it in the buffer as-is. 
*/ max_headroom = LL_RESERVED_SPACE(tdev) + t_hlen; if (skb_headroom(skb) < max_headroom || skb_shared(skb) || (skb_cloned(skb) && !skb_clone_writable(skb, 0))) { struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); if (!new_skb) { ip_rt_put(rt); DEV_STATS_INC(dev, tx_dropped); kfree_skb(skb); return NETDEV_TX_OK; } if (skb->sk) skb_set_owner_w(new_skb, skb->sk); dev_kfree_skb(skb); skb = new_skb; iph6 = ipv6_hdr(skb); } ttl = tiph->ttl; if (ttl == 0) ttl = iph6->hop_limit; tos = INET_ECN_encapsulate(tos, ipv6_get_dsfield(iph6)); if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0) { ip_rt_put(rt); goto tx_error; } skb_set_inner_ipproto(skb, IPPROTO_IPV6); iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl, df, !net_eq(tunnel->net, dev_net(dev))); return NETDEV_TX_OK; tx_error_icmp: dst_link_failure(skb); tx_error: kfree_skb(skb); DEV_STATS_INC(dev, tx_errors); return NETDEV_TX_OK; } static netdev_tx_t sit_tunnel_xmit__(struct sk_buff *skb, struct net_device *dev, u8 ipproto) { struct ip_tunnel *tunnel = netdev_priv(dev); const struct iphdr *tiph = &tunnel->parms.iph; if (iptunnel_handle_offloads(skb, SKB_GSO_IPXIP4)) goto tx_error; skb_set_inner_ipproto(skb, ipproto); ip_tunnel_xmit(skb, dev, tiph, ipproto); return NETDEV_TX_OK; tx_error: kfree_skb(skb); DEV_STATS_INC(dev, tx_errors); return NETDEV_TX_OK; } static netdev_tx_t sit_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) { if (!pskb_inet_may_pull(skb)) goto tx_err; switch (skb->protocol) { case htons(ETH_P_IP): sit_tunnel_xmit__(skb, dev, IPPROTO_IPIP); break; case htons(ETH_P_IPV6): ipip6_tunnel_xmit(skb, dev); break; #if IS_ENABLED(CONFIG_MPLS) case htons(ETH_P_MPLS_UC): sit_tunnel_xmit__(skb, dev, IPPROTO_MPLS); break; #endif default: goto tx_err; } return NETDEV_TX_OK; tx_err: DEV_STATS_INC(dev, tx_errors); kfree_skb(skb); return NETDEV_TX_OK; } static void ipip6_tunnel_bind_dev(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); int t_hlen = tunnel->hlen + sizeof(struct iphdr); struct net_device *tdev = NULL; int hlen = LL_MAX_HEADER; const struct iphdr *iph; struct flowi4 fl4; iph = &tunnel->parms.iph; if (iph->daddr) { struct rtable *rt = ip_route_output_ports(tunnel->net, &fl4, NULL, iph->daddr, iph->saddr, 0, 0, IPPROTO_IPV6, RT_TOS(iph->tos), tunnel->parms.link); if (!IS_ERR(rt)) { tdev = rt->dst.dev; ip_rt_put(rt); } dev->flags |= IFF_POINTOPOINT; } if (!tdev && tunnel->parms.link) tdev = __dev_get_by_index(tunnel->net, tunnel->parms.link); if (tdev && !netif_is_l3_master(tdev)) { int mtu; mtu = tdev->mtu - t_hlen; if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; WRITE_ONCE(dev->mtu, mtu); hlen = tdev->hard_header_len + tdev->needed_headroom; } dev->needed_headroom = t_hlen + hlen; } static void ipip6_tunnel_update(struct ip_tunnel *t, struct ip_tunnel_parm *p, __u32 fwmark) { struct net *net = t->net; struct sit_net *sitn = net_generic(net, sit_net_id); ipip6_tunnel_unlink(sitn, t); synchronize_net(); t->parms.iph.saddr = p->iph.saddr; t->parms.iph.daddr = p->iph.daddr; memcpy(t->dev->dev_addr, &p->iph.saddr, 4); memcpy(t->dev->broadcast, &p->iph.daddr, 4); ipip6_tunnel_link(sitn, t); t->parms.iph.ttl = p->iph.ttl; t->parms.iph.tos = p->iph.tos; t->parms.iph.frag_off = p->iph.frag_off; if (t->parms.link != p->link || t->fwmark != fwmark) { t->parms.link = p->link; t->fwmark = fwmark; ipip6_tunnel_bind_dev(t->dev); } dst_cache_reset(&t->dst_cache); netdev_state_change(t->dev); } #ifdef CONFIG_IPV6_SIT_6RD static int ipip6_tunnel_update_6rd(struct ip_tunnel *t, 
struct ip_tunnel_6rd *ip6rd) { struct in6_addr prefix; __be32 relay_prefix; if (ip6rd->relay_prefixlen > 32 || ip6rd->prefixlen + (32 - ip6rd->relay_prefixlen) > 64) return -EINVAL; ipv6_addr_prefix(&prefix, &ip6rd->prefix, ip6rd->prefixlen); if (!ipv6_addr_equal(&prefix, &ip6rd->prefix)) return -EINVAL; if (ip6rd->relay_prefixlen) relay_prefix = ip6rd->relay_prefix & htonl(0xffffffffUL << (32 - ip6rd->relay_prefixlen)); else relay_prefix = 0; if (relay_prefix != ip6rd->relay_prefix) return -EINVAL; t->ip6rd.prefix = prefix; t->ip6rd.relay_prefix = relay_prefix; t->ip6rd.prefixlen = ip6rd->prefixlen; t->ip6rd.relay_prefixlen = ip6rd->relay_prefixlen; dst_cache_reset(&t->dst_cache); netdev_state_change(t->dev); return 0; } static int ipip6_tunnel_get6rd(struct net_device *dev, struct ip_tunnel_parm __user *data) { struct ip_tunnel *t = netdev_priv(dev); struct ip_tunnel_6rd ip6rd; struct ip_tunnel_parm p; if (dev == dev_to_sit_net(dev)->fb_tunnel_dev) { if (copy_from_user(&p, data, sizeof(p))) return -EFAULT; t = ipip6_tunnel_locate(t->net, &p, 0); } if (!t) t = netdev_priv(dev); ip6rd.prefix = t->ip6rd.prefix; ip6rd.relay_prefix = t->ip6rd.relay_prefix; ip6rd.prefixlen = t->ip6rd.prefixlen; ip6rd.relay_prefixlen = t->ip6rd.relay_prefixlen; if (copy_to_user(data, &ip6rd, sizeof(ip6rd))) return -EFAULT; return 0; } static int ipip6_tunnel_6rdctl(struct net_device *dev, struct ip_tunnel_6rd __user *data, int cmd) { struct ip_tunnel *t = netdev_priv(dev); struct ip_tunnel_6rd ip6rd; int err; if (!ns_capable(t->net->user_ns, CAP_NET_ADMIN)) return -EPERM; if (copy_from_user(&ip6rd, data, sizeof(ip6rd))) return -EFAULT; if (cmd != SIOCDEL6RD) { err = ipip6_tunnel_update_6rd(t, &ip6rd); if (err < 0) return err; } else ipip6_tunnel_clone_6rd(dev, dev_to_sit_net(dev)); return 0; } #endif /* CONFIG_IPV6_SIT_6RD */ static bool ipip6_valid_ip_proto(u8 ipproto) { return ipproto == IPPROTO_IPV6 || ipproto == IPPROTO_IPIP || #if IS_ENABLED(CONFIG_MPLS) ipproto == IPPROTO_MPLS || #endif ipproto == 0; } static int __ipip6_tunnel_ioctl_validate(struct net *net, struct ip_tunnel_parm *p) { if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; if (!ipip6_valid_ip_proto(p->iph.protocol)) return -EINVAL; if (p->iph.version != 4 || p->iph.ihl != 5 || (p->iph.frag_off & htons(~IP_DF))) return -EINVAL; if (p->iph.ttl) p->iph.frag_off |= htons(IP_DF); return 0; } static int ipip6_tunnel_get(struct net_device *dev, struct ip_tunnel_parm *p) { struct ip_tunnel *t = netdev_priv(dev); if (dev == dev_to_sit_net(dev)->fb_tunnel_dev) t = ipip6_tunnel_locate(t->net, p, 0); if (!t) t = netdev_priv(dev); memcpy(p, &t->parms, sizeof(*p)); return 0; } static int ipip6_tunnel_add(struct net_device *dev, struct ip_tunnel_parm *p) { struct ip_tunnel *t = netdev_priv(dev); int err; err = __ipip6_tunnel_ioctl_validate(t->net, p); if (err) return err; t = ipip6_tunnel_locate(t->net, p, 1); if (!t) return -ENOBUFS; return 0; } static int ipip6_tunnel_change(struct net_device *dev, struct ip_tunnel_parm *p) { struct ip_tunnel *t = netdev_priv(dev); int err; err = __ipip6_tunnel_ioctl_validate(t->net, p); if (err) return err; t = ipip6_tunnel_locate(t->net, p, 0); if (dev == dev_to_sit_net(dev)->fb_tunnel_dev) { if (!t) return -ENOENT; } else { if (t) { if (t->dev != dev) return -EEXIST; } else { if (((dev->flags & IFF_POINTOPOINT) && !p->iph.daddr) || (!(dev->flags & IFF_POINTOPOINT) && p->iph.daddr)) return -EINVAL; t = netdev_priv(dev); } ipip6_tunnel_update(t, p, t->fwmark); } return 0; } static int ipip6_tunnel_del(struct 
net_device *dev, struct ip_tunnel_parm *p) { struct ip_tunnel *t = netdev_priv(dev); if (!ns_capable(t->net->user_ns, CAP_NET_ADMIN)) return -EPERM; if (dev == dev_to_sit_net(dev)->fb_tunnel_dev) { t = ipip6_tunnel_locate(t->net, p, 0); if (!t) return -ENOENT; if (t == netdev_priv(dev_to_sit_net(dev)->fb_tunnel_dev)) return -EPERM; dev = t->dev; } unregister_netdevice(dev); return 0; } static int ipip6_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) { switch (cmd) { case SIOCGETTUNNEL: return ipip6_tunnel_get(dev, p); case SIOCADDTUNNEL: return ipip6_tunnel_add(dev, p); case SIOCCHGTUNNEL: return ipip6_tunnel_change(dev, p); case SIOCDELTUNNEL: return ipip6_tunnel_del(dev, p); default: return -EINVAL; } } static int ipip6_tunnel_siocdevprivate(struct net_device *dev, struct ifreq *ifr, void __user *data, int cmd) { switch (cmd) { case SIOCGETTUNNEL: case SIOCADDTUNNEL: case SIOCCHGTUNNEL: case SIOCDELTUNNEL: return ip_tunnel_siocdevprivate(dev, ifr, data, cmd); case SIOCGETPRL: return ipip6_tunnel_get_prl(dev, data); case SIOCADDPRL: case SIOCDELPRL: case SIOCCHGPRL: return ipip6_tunnel_prl_ctl(dev, data, cmd); #ifdef CONFIG_IPV6_SIT_6RD case SIOCGET6RD: return ipip6_tunnel_get6rd(dev, data); case SIOCADD6RD: case SIOCCHG6RD: case SIOCDEL6RD: return ipip6_tunnel_6rdctl(dev, data, cmd); #endif default: return -EINVAL; } } static const struct net_device_ops ipip6_netdev_ops = { .ndo_init = ipip6_tunnel_init, .ndo_uninit = ipip6_tunnel_uninit, .ndo_start_xmit = sit_tunnel_xmit, .ndo_siocdevprivate = ipip6_tunnel_siocdevprivate, .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip_tunnel_get_iflink, .ndo_tunnel_ctl = ipip6_tunnel_ctl, }; static void ipip6_dev_free(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); dst_cache_destroy(&tunnel->dst_cache); free_percpu(dev->tstats); } #define SIT_FEATURES (NETIF_F_SG | \ NETIF_F_FRAGLIST | \ NETIF_F_HIGHDMA | \ NETIF_F_GSO_SOFTWARE | \ NETIF_F_HW_CSUM) static void ipip6_tunnel_setup(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); int t_hlen = tunnel->hlen + sizeof(struct iphdr); dev->netdev_ops = &ipip6_netdev_ops; dev->header_ops = &ip_tunnel_header_ops; dev->needs_free_netdev = true; dev->priv_destructor = ipip6_dev_free; dev->type = ARPHRD_SIT; dev->mtu = ETH_DATA_LEN - t_hlen; dev->min_mtu = IPV6_MIN_MTU; dev->max_mtu = IP6_MAX_MTU - t_hlen; dev->flags = IFF_NOARP; netif_keep_dst(dev); dev->addr_len = 4; dev->features |= NETIF_F_LLTX; dev->features |= SIT_FEATURES; dev->hw_features |= SIT_FEATURES; } static int ipip6_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); int err; tunnel->dev = dev; tunnel->net = dev_net(dev); strcpy(tunnel->parms.name, dev->name); ipip6_tunnel_bind_dev(dev); dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); if (!dev->tstats) return -ENOMEM; err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL); if (err) { free_percpu(dev->tstats); dev->tstats = NULL; return err; } dev_hold(dev); return 0; } static void __net_init ipip6_fb_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); struct iphdr *iph = &tunnel->parms.iph; struct net *net = dev_net(dev); struct sit_net *sitn = net_generic(net, sit_net_id); iph->version = 4; iph->protocol = IPPROTO_IPV6; iph->ihl = 5; iph->ttl = 64; rcu_assign_pointer(sitn->tunnels_wc[0], tunnel); } static int ipip6_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { u8 proto; if (!data || !data[IFLA_IPTUN_PROTO]) 
return 0; proto = nla_get_u8(data[IFLA_IPTUN_PROTO]); if (!ipip6_valid_ip_proto(proto)) return -EINVAL; return 0; } static void ipip6_netlink_parms(struct nlattr *data[], struct ip_tunnel_parm *parms, __u32 *fwmark) { memset(parms, 0, sizeof(*parms)); parms->iph.version = 4; parms->iph.protocol = IPPROTO_IPV6; parms->iph.ihl = 5; parms->iph.ttl = 64; if (!data) return; if (data[IFLA_IPTUN_LINK]) parms->link = nla_get_u32(data[IFLA_IPTUN_LINK]); if (data[IFLA_IPTUN_LOCAL]) parms->iph.saddr = nla_get_be32(data[IFLA_IPTUN_LOCAL]); if (data[IFLA_IPTUN_REMOTE]) parms->iph.daddr = nla_get_be32(data[IFLA_IPTUN_REMOTE]); if (data[IFLA_IPTUN_TTL]) { parms->iph.ttl = nla_get_u8(data[IFLA_IPTUN_TTL]); if (parms->iph.ttl) parms->iph.frag_off = htons(IP_DF); } if (data[IFLA_IPTUN_TOS]) parms->iph.tos = nla_get_u8(data[IFLA_IPTUN_TOS]); if (!data[IFLA_IPTUN_PMTUDISC] || nla_get_u8(data[IFLA_IPTUN_PMTUDISC])) parms->iph.frag_off = htons(IP_DF); if (data[IFLA_IPTUN_FLAGS]) parms->i_flags = nla_get_be16(data[IFLA_IPTUN_FLAGS]); if (data[IFLA_IPTUN_PROTO]) parms->iph.protocol = nla_get_u8(data[IFLA_IPTUN_PROTO]); if (data[IFLA_IPTUN_FWMARK]) *fwmark = nla_get_u32(data[IFLA_IPTUN_FWMARK]); } /* This function returns true when ENCAP attributes are present in the nl msg */ static bool ipip6_netlink_encap_parms(struct nlattr *data[], struct ip_tunnel_encap *ipencap) { bool ret = false; memset(ipencap, 0, sizeof(*ipencap)); if (!data) return ret; if (data[IFLA_IPTUN_ENCAP_TYPE]) { ret = true; ipencap->type = nla_get_u16(data[IFLA_IPTUN_ENCAP_TYPE]); } if (data[IFLA_IPTUN_ENCAP_FLAGS]) { ret = true; ipencap->flags = nla_get_u16(data[IFLA_IPTUN_ENCAP_FLAGS]); } if (data[IFLA_IPTUN_ENCAP_SPORT]) { ret = true; ipencap->sport = nla_get_be16(data[IFLA_IPTUN_ENCAP_SPORT]); } if (data[IFLA_IPTUN_ENCAP_DPORT]) { ret = true; ipencap->dport = nla_get_be16(data[IFLA_IPTUN_ENCAP_DPORT]); } return ret; } #ifdef CONFIG_IPV6_SIT_6RD /* This function returns true when 6RD attributes are present in the nl msg */ static bool ipip6_netlink_6rd_parms(struct nlattr *data[], struct ip_tunnel_6rd *ip6rd) { bool ret = false; memset(ip6rd, 0, sizeof(*ip6rd)); if (!data) return ret; if (data[IFLA_IPTUN_6RD_PREFIX]) { ret = true; ip6rd->prefix = nla_get_in6_addr(data[IFLA_IPTUN_6RD_PREFIX]); } if (data[IFLA_IPTUN_6RD_RELAY_PREFIX]) { ret = true; ip6rd->relay_prefix = nla_get_be32(data[IFLA_IPTUN_6RD_RELAY_PREFIX]); } if (data[IFLA_IPTUN_6RD_PREFIXLEN]) { ret = true; ip6rd->prefixlen = nla_get_u16(data[IFLA_IPTUN_6RD_PREFIXLEN]); } if (data[IFLA_IPTUN_6RD_RELAY_PREFIXLEN]) { ret = true; ip6rd->relay_prefixlen = nla_get_u16(data[IFLA_IPTUN_6RD_RELAY_PREFIXLEN]); } return ret; } #endif static int ipip6_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct net *net = dev_net(dev); struct ip_tunnel *nt; struct ip_tunnel_encap ipencap; #ifdef CONFIG_IPV6_SIT_6RD struct ip_tunnel_6rd ip6rd; #endif int err; nt = netdev_priv(dev); if (ipip6_netlink_encap_parms(data, &ipencap)) { err = ip_tunnel_encap_setup(nt, &ipencap); if (err < 0) return err; } ipip6_netlink_parms(data, &nt->parms, &nt->fwmark); if (ipip6_tunnel_locate(net, &nt->parms, 0)) return -EEXIST; err = ipip6_tunnel_create(dev); if (err < 0) return err; if (tb[IFLA_MTU]) { u32 mtu = nla_get_u32(tb[IFLA_MTU]); if (mtu >= IPV6_MIN_MTU && mtu <= IP6_MAX_MTU - dev->hard_header_len) dev->mtu = mtu; } #ifdef CONFIG_IPV6_SIT_6RD if (ipip6_netlink_6rd_parms(data, &ip6rd)) { err = ipip6_tunnel_update_6rd(nt, 
&ip6rd); if (err < 0) unregister_netdevice_queue(dev, NULL); } #endif return err; } static int ipip6_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct ip_tunnel *t = netdev_priv(dev); struct ip_tunnel_parm p; struct ip_tunnel_encap ipencap; struct net *net = t->net; struct sit_net *sitn = net_generic(net, sit_net_id); #ifdef CONFIG_IPV6_SIT_6RD struct ip_tunnel_6rd ip6rd; #endif __u32 fwmark = t->fwmark; int err; if (dev == sitn->fb_tunnel_dev) return -EINVAL; if (ipip6_netlink_encap_parms(data, &ipencap)) { err = ip_tunnel_encap_setup(t, &ipencap); if (err < 0) return err; } ipip6_netlink_parms(data, &p, &fwmark); if (((dev->flags & IFF_POINTOPOINT) && !p.iph.daddr) || (!(dev->flags & IFF_POINTOPOINT) && p.iph.daddr)) return -EINVAL; t = ipip6_tunnel_locate(net, &p, 0); if (t) { if (t->dev != dev) return -EEXIST; } else t = netdev_priv(dev); ipip6_tunnel_update(t, &p, fwmark); #ifdef CONFIG_IPV6_SIT_6RD if (ipip6_netlink_6rd_parms(data, &ip6rd)) return ipip6_tunnel_update_6rd(t, &ip6rd); #endif return 0; } static size_t ipip6_get_size(const struct net_device *dev) { return /* IFLA_IPTUN_LINK */ nla_total_size(4) + /* IFLA_IPTUN_LOCAL */ nla_total_size(4) + /* IFLA_IPTUN_REMOTE */ nla_total_size(4) + /* IFLA_IPTUN_TTL */ nla_total_size(1) + /* IFLA_IPTUN_TOS */ nla_total_size(1) + /* IFLA_IPTUN_PMTUDISC */ nla_total_size(1) + /* IFLA_IPTUN_FLAGS */ nla_total_size(2) + /* IFLA_IPTUN_PROTO */ nla_total_size(1) + #ifdef CONFIG_IPV6_SIT_6RD /* IFLA_IPTUN_6RD_PREFIX */ nla_total_size(sizeof(struct in6_addr)) + /* IFLA_IPTUN_6RD_RELAY_PREFIX */ nla_total_size(4) + /* IFLA_IPTUN_6RD_PREFIXLEN */ nla_total_size(2) + /* IFLA_IPTUN_6RD_RELAY_PREFIXLEN */ nla_total_size(2) + #endif /* IFLA_IPTUN_ENCAP_TYPE */ nla_total_size(2) + /* IFLA_IPTUN_ENCAP_FLAGS */ nla_total_size(2) + /* IFLA_IPTUN_ENCAP_SPORT */ nla_total_size(2) + /* IFLA_IPTUN_ENCAP_DPORT */ nla_total_size(2) + /* IFLA_IPTUN_FWMARK */ nla_total_size(4) + 0; } static int ipip6_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); struct ip_tunnel_parm *parm = &tunnel->parms; if (nla_put_u32(skb, IFLA_IPTUN_LINK, parm->link) || nla_put_in_addr(skb, IFLA_IPTUN_LOCAL, parm->iph.saddr) || nla_put_in_addr(skb, IFLA_IPTUN_REMOTE, parm->iph.daddr) || nla_put_u8(skb, IFLA_IPTUN_TTL, parm->iph.ttl) || nla_put_u8(skb, IFLA_IPTUN_TOS, parm->iph.tos) || nla_put_u8(skb, IFLA_IPTUN_PMTUDISC, !!(parm->iph.frag_off & htons(IP_DF))) || nla_put_u8(skb, IFLA_IPTUN_PROTO, parm->iph.protocol) || nla_put_be16(skb, IFLA_IPTUN_FLAGS, parm->i_flags) || nla_put_u32(skb, IFLA_IPTUN_FWMARK, tunnel->fwmark)) goto nla_put_failure; #ifdef CONFIG_IPV6_SIT_6RD if (nla_put_in6_addr(skb, IFLA_IPTUN_6RD_PREFIX, &tunnel->ip6rd.prefix) || nla_put_in_addr(skb, IFLA_IPTUN_6RD_RELAY_PREFIX, tunnel->ip6rd.relay_prefix) || nla_put_u16(skb, IFLA_IPTUN_6RD_PREFIXLEN, tunnel->ip6rd.prefixlen) || nla_put_u16(skb, IFLA_IPTUN_6RD_RELAY_PREFIXLEN, tunnel->ip6rd.relay_prefixlen)) goto nla_put_failure; #endif if (nla_put_u16(skb, IFLA_IPTUN_ENCAP_TYPE, tunnel->encap.type) || nla_put_be16(skb, IFLA_IPTUN_ENCAP_SPORT, tunnel->encap.sport) || nla_put_be16(skb, IFLA_IPTUN_ENCAP_DPORT, tunnel->encap.dport) || nla_put_u16(skb, IFLA_IPTUN_ENCAP_FLAGS, tunnel->encap.flags)) goto nla_put_failure; return 0; nla_put_failure: return -EMSGSIZE; } static const struct nla_policy ipip6_policy[IFLA_IPTUN_MAX + 1] = { [IFLA_IPTUN_LINK] = { .type = NLA_U32 }, [IFLA_IPTUN_LOCAL] = { .type 
= NLA_U32 }, [IFLA_IPTUN_REMOTE] = { .type = NLA_U32 }, [IFLA_IPTUN_TTL] = { .type = NLA_U8 }, [IFLA_IPTUN_TOS] = { .type = NLA_U8 }, [IFLA_IPTUN_PMTUDISC] = { .type = NLA_U8 }, [IFLA_IPTUN_FLAGS] = { .type = NLA_U16 }, [IFLA_IPTUN_PROTO] = { .type = NLA_U8 }, #ifdef CONFIG_IPV6_SIT_6RD [IFLA_IPTUN_6RD_PREFIX] = { .len = sizeof(struct in6_addr) }, [IFLA_IPTUN_6RD_RELAY_PREFIX] = { .type = NLA_U32 }, [IFLA_IPTUN_6RD_PREFIXLEN] = { .type = NLA_U16 }, [IFLA_IPTUN_6RD_RELAY_PREFIXLEN] = { .type = NLA_U16 }, #endif [IFLA_IPTUN_ENCAP_TYPE] = { .type = NLA_U16 }, [IFLA_IPTUN_ENCAP_FLAGS] = { .type = NLA_U16 }, [IFLA_IPTUN_ENCAP_SPORT] = { .type = NLA_U16 }, [IFLA_IPTUN_ENCAP_DPORT] = { .type = NLA_U16 }, [IFLA_IPTUN_FWMARK] = { .type = NLA_U32 }, }; static void ipip6_dellink(struct net_device *dev, struct list_head *head) { struct net *net = dev_net(dev); struct sit_net *sitn = net_generic(net, sit_net_id); if (dev != sitn->fb_tunnel_dev) unregister_netdevice_queue(dev, head); } static struct rtnl_link_ops sit_link_ops __read_mostly = { .kind = "sit", .maxtype = IFLA_IPTUN_MAX, .policy = ipip6_policy, .priv_size = sizeof(struct ip_tunnel), .setup = ipip6_tunnel_setup, .validate = ipip6_validate, .newlink = ipip6_newlink, .changelink = ipip6_changelink, .get_size = ipip6_get_size, .fill_info = ipip6_fill_info, .dellink = ipip6_dellink, .get_link_net = ip_tunnel_get_link_net, }; static struct xfrm_tunnel sit_handler __read_mostly = { .handler = ipip6_rcv, .err_handler = ipip6_err, .priority = 1, }; static struct xfrm_tunnel ipip_handler __read_mostly = { .handler = ipip_rcv, .err_handler = ipip6_err, .priority = 2, }; #if IS_ENABLED(CONFIG_MPLS) static struct xfrm_tunnel mplsip_handler __read_mostly = { .handler = mplsip_rcv, .err_handler = ipip6_err, .priority = 2, }; #endif static void __net_exit sit_destroy_tunnels(struct net *net, struct list_head *head) { struct sit_net *sitn = net_generic(net, sit_net_id); struct net_device *dev, *aux; int prio; for_each_netdev_safe(net, dev, aux) if (dev->rtnl_link_ops == &sit_link_ops) unregister_netdevice_queue(dev, head); for (prio = 0; prio < 4; prio++) { int h; for (h = 0; h < (prio ? IP6_SIT_HASH_SIZE : 1); h++) { struct ip_tunnel *t; t = rtnl_dereference(sitn->tunnels[prio][h]); while (t) { /* If dev is in the same netns, it has already * been added to the list by the previous loop. */ if (!net_eq(dev_net(t->dev), net)) unregister_netdevice_queue(t->dev, head); t = rtnl_dereference(t->next); } } } } static int __net_init sit_init_net(struct net *net) { struct sit_net *sitn = net_generic(net, sit_net_id); struct ip_tunnel *t; int err; sitn->tunnels[0] = sitn->tunnels_wc; sitn->tunnels[1] = sitn->tunnels_l; sitn->tunnels[2] = sitn->tunnels_r; sitn->tunnels[3] = sitn->tunnels_r_l; if (!net_has_fallback_tunnels(net)) return 0; sitn->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "sit0", NET_NAME_UNKNOWN, ipip6_tunnel_setup); if (!sitn->fb_tunnel_dev) { err = -ENOMEM; goto err_alloc_dev; } dev_net_set(sitn->fb_tunnel_dev, net); sitn->fb_tunnel_dev->rtnl_link_ops = &sit_link_ops; /* FB netdevice is special: we have one, and only one per netns. * Allowing to move it to another netns is clearly unsafe. 
*/ sitn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL; err = register_netdev(sitn->fb_tunnel_dev); if (err) goto err_reg_dev; ipip6_tunnel_clone_6rd(sitn->fb_tunnel_dev, sitn); ipip6_fb_tunnel_init(sitn->fb_tunnel_dev); t = netdev_priv(sitn->fb_tunnel_dev); strcpy(t->parms.name, sitn->fb_tunnel_dev->name); return 0; err_reg_dev: free_netdev(sitn->fb_tunnel_dev); err_alloc_dev: return err; } static void __net_exit sit_exit_batch_net(struct list_head *net_list) { LIST_HEAD(list); struct net *net; rtnl_lock(); list_for_each_entry(net, net_list, exit_list) sit_destroy_tunnels(net, &list); unregister_netdevice_many(&list); rtnl_unlock(); } static struct pernet_operations sit_net_ops = { .init = sit_init_net, .exit_batch = sit_exit_batch_net, .id = &sit_net_id, .size = sizeof(struct sit_net), }; static void __exit sit_cleanup(void) { rtnl_link_unregister(&sit_link_ops); xfrm4_tunnel_deregister(&sit_handler, AF_INET6); xfrm4_tunnel_deregister(&ipip_handler, AF_INET); #if IS_ENABLED(CONFIG_MPLS) xfrm4_tunnel_deregister(&mplsip_handler, AF_MPLS); #endif unregister_pernet_device(&sit_net_ops); rcu_barrier(); /* Wait for completion of call_rcu()'s */ } static int __init sit_init(void) { int err; pr_info("IPv6, IPv4 and MPLS over IPv4 tunneling driver\n"); err = register_pernet_device(&sit_net_ops); if (err < 0) return err; err = xfrm4_tunnel_register(&sit_handler, AF_INET6); if (err < 0) { pr_info("%s: can't register ip6ip4\n", __func__); goto xfrm_tunnel_failed; } err = xfrm4_tunnel_register(&ipip_handler, AF_INET); if (err < 0) { pr_info("%s: can't register ip4ip4\n", __func__); goto xfrm_tunnel4_failed; } #if IS_ENABLED(CONFIG_MPLS) err = xfrm4_tunnel_register(&mplsip_handler, AF_MPLS); if (err < 0) { pr_info("%s: can't register mplsip\n", __func__); goto xfrm_tunnel_mpls_failed; } #endif err = rtnl_link_register(&sit_link_ops); if (err < 0) goto rtnl_link_failed; out: return err; rtnl_link_failed: #if IS_ENABLED(CONFIG_MPLS) xfrm4_tunnel_deregister(&mplsip_handler, AF_MPLS); xfrm_tunnel_mpls_failed: #endif xfrm4_tunnel_deregister(&ipip_handler, AF_INET); xfrm_tunnel4_failed: xfrm4_tunnel_deregister(&sit_handler, AF_INET6); xfrm_tunnel_failed: unregister_pernet_device(&sit_net_ops); goto out; } module_init(sit_init); module_exit(sit_cleanup); MODULE_LICENSE("GPL"); MODULE_ALIAS_RTNL_LINK("sit"); MODULE_ALIAS_NETDEV("sit0");
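The address mapping earlier in this file is the core of the stateless operation: check_6rd() either shifts the bits that follow the configured 6rd prefix into the relay prefix (for example, with a hypothetical prefix 2001:db8::/32 and relay prefix 10.0.0.0/8, bits 32..55 of the IPv6 destination supply the low 24 bits of the IPv4 endpoint, and ipip6_tunnel_update_6rd() accepts that configuration because 32 + (32 - 8) = 56 <= 64), or it falls back to the RFC 3056 6to4 layout, where the IPv4 endpoint sits directly after the 2002::/16 prefix. Below is a minimal userspace sketch of that 6to4 fallback; the helper name is hypothetical and the program is an illustration only, not part of the driver.

/*
 * Illustrative sketch (not part of the driver): how the 6to4 fallback in
 * check_6rd() recovers the embedded IPv4 address.  A 6to4 address is
 * 2002:VVVV:WWWW::/48, where the bytes VV VV WW WW are the IPv4 address.
 */
#include <stdbool.h>
#include <stdio.h>
#include <string.h>
#include <netinet/in.h>
#include <arpa/inet.h>

static bool sixtofour_embedded_v4(const struct in6_addr *v6, struct in_addr *v4)
{
	/* 16-bit prefix 2002::/16, then 32 bits of IPv4, then SLA id ... */
	if (v6->s6_addr[0] != 0x20 || v6->s6_addr[1] != 0x02)
		return false;
	memcpy(&v4->s_addr, &v6->s6_addr[2], 4);	/* already network order */
	return true;
}

int main(void)
{
	struct in6_addr v6;
	struct in_addr v4;
	char buf[INET_ADDRSTRLEN];

	inet_pton(AF_INET6, "2002:c000:0204::1", &v6);	/* 192.0.2.4 embedded */
	if (sixtofour_embedded_v4(&v6, &v4))
		printf("tunnel endpoint: %s\n",
		       inet_ntop(AF_INET, &v4, buf, sizeof(buf)));
	return 0;
}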
/* SPDX-License-Identifier: GPL-2.0-or-later */ /* * NET Generic infrastructure for Network protocols. * * Definitions for request_sock * * Authors: Arnaldo Carvalho de Melo <acme@conectiva.com.br> * * From code originally in include/net/tcp.h */ #ifndef _REQUEST_SOCK_H #define _REQUEST_SOCK_H #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/types.h> #include <linux/bug.h> #include <linux/refcount.h> #include <net/sock.h> struct request_sock; struct sk_buff; struct dst_entry; struct proto; struct request_sock_ops { int family; unsigned int obj_size; struct kmem_cache *slab; char *slab_name; int (*rtx_syn_ack)(const struct sock *sk, struct request_sock *req); void (*send_ack)(const struct sock *sk, struct sk_buff *skb, struct request_sock *req); void (*send_reset)(const struct sock *sk, struct sk_buff *skb); void (*destructor)(struct request_sock *req); void (*syn_ack_timeout)(const struct request_sock *req); }; int inet_rtx_syn_ack(const struct sock *parent, struct request_sock *req); struct saved_syn { u32 mac_hdrlen; u32 network_hdrlen; u32 tcp_hdrlen; u8 data[]; }; /* struct request_sock - mini sock to represent a connection request */ struct request_sock { struct sock_common __req_common; #define rsk_refcnt __req_common.skc_refcnt #define rsk_hash __req_common.skc_hash #define rsk_listener __req_common.skc_listener #define rsk_window_clamp __req_common.skc_window_clamp #define rsk_rcv_wnd __req_common.skc_rcv_wnd struct request_sock *dl_next; u16 mss; u8 num_retrans; /* number of retransmits */ u8 syncookie:1; /* syncookie: encode tcpopts in timestamp */ u8 num_timeout:7; /* number of timeouts */ u32 ts_recent; struct timer_list rsk_timer; const struct request_sock_ops *rsk_ops; struct sock *sk; struct saved_syn *saved_syn; u32 secid; u32 peer_secid; u32 timeout; }; static inline struct request_sock *inet_reqsk(const struct sock *sk) { return (struct request_sock *)sk; } static inline struct sock *req_to_sk(struct request_sock *req) { return (struct sock *)req; } static inline struct request_sock * reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk_listener, bool attach_listener) { struct request_sock *req; req = kmem_cache_alloc(ops->slab, GFP_ATOMIC | __GFP_NOWARN); if (!req) return NULL; req->rsk_listener = NULL; if (attach_listener) { if (unlikely(!refcount_inc_not_zero(&sk_listener->sk_refcnt))) { kmem_cache_free(ops->slab, req); return NULL; } req->rsk_listener = sk_listener; } req->rsk_ops = ops; req_to_sk(req)->sk_prot = sk_listener->sk_prot; sk_node_init(&req_to_sk(req)->sk_node); sk_tx_queue_clear(req_to_sk(req)); req->saved_syn = NULL; req->timeout = 0; req->num_timeout = 0; req->num_retrans = 0; req->sk 
= NULL; refcount_set(&req->rsk_refcnt, 0); return req; } static inline void __reqsk_free(struct request_sock *req) { req->rsk_ops->destructor(req); if (req->rsk_listener) sock_put(req->rsk_listener); kfree(req->saved_syn); kmem_cache_free(req->rsk_ops->slab, req); } static inline void reqsk_free(struct request_sock *req) { WARN_ON_ONCE(refcount_read(&req->rsk_refcnt) != 0); __reqsk_free(req); } static inline void reqsk_put(struct request_sock *req) { if (refcount_dec_and_test(&req->rsk_refcnt)) reqsk_free(req); } /* * For a TCP Fast Open listener - * lock - protects the access to all the reqsk, which is co-owned by * the listener and the child socket. * qlen - pending TFO requests (still in TCP_SYN_RECV). * max_qlen - max TFO reqs allowed before TFO is disabled. * * XXX (TFO) - ideally these fields can be made as part of "listen_sock" * structure above. But there is some implementation difficulty due to * listen_sock being part of request_sock_queue hence will be freed when * a listener is stopped. But TFO related fields may continue to be * accessed even after a listener is closed, until its sk_refcnt drops * to 0 implying no more outstanding TFO reqs. One solution is to keep * listen_opt around until sk_refcnt drops to 0. But there is some other * complexity that needs to be resolved. E.g., a listener can be disabled * temporarily through shutdown()->tcp_disconnect(), and re-enabled later. */ struct fastopen_queue { struct request_sock *rskq_rst_head; /* Keep track of past TFO */ struct request_sock *rskq_rst_tail; /* requests that caused RST. * This is part of the defense * against spoofing attack. */ spinlock_t lock; int qlen; /* # of pending (TCP_SYN_RECV) reqs */ int max_qlen; /* != 0 iff TFO is currently enabled */ struct tcp_fastopen_context __rcu *ctx; /* cipher context for cookie */ }; /** struct request_sock_queue - queue of request_socks * * @rskq_accept_head - FIFO head of established children * @rskq_accept_tail - FIFO tail of established children * @rskq_defer_accept - User waits for some data after accept() * */ struct request_sock_queue { spinlock_t rskq_lock; u8 rskq_defer_accept; u32 synflood_warned; atomic_t qlen; atomic_t young; struct request_sock *rskq_accept_head; struct request_sock *rskq_accept_tail; struct fastopen_queue fastopenq; /* Check max_qlen != 0 to determine * if TFO is enabled. 
*/ }; void reqsk_queue_alloc(struct request_sock_queue *queue); void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req, bool reset); static inline bool reqsk_queue_empty(const struct request_sock_queue *queue) { return READ_ONCE(queue->rskq_accept_head) == NULL; } static inline struct request_sock *reqsk_queue_remove(struct request_sock_queue *queue, struct sock *parent) { struct request_sock *req; spin_lock_bh(&queue->rskq_lock); req = queue->rskq_accept_head; if (req) { sk_acceptq_removed(parent); WRITE_ONCE(queue->rskq_accept_head, req->dl_next); if (queue->rskq_accept_head == NULL) queue->rskq_accept_tail = NULL; } spin_unlock_bh(&queue->rskq_lock); return req; } static inline void reqsk_queue_removed(struct request_sock_queue *queue, const struct request_sock *req) { if (req->num_timeout == 0) atomic_dec(&queue->young); atomic_dec(&queue->qlen); } static inline void reqsk_queue_added(struct request_sock_queue *queue) { atomic_inc(&queue->young); atomic_inc(&queue->qlen); } static inline int reqsk_queue_len(const struct request_sock_queue *queue) { return atomic_read(&queue->qlen); } static inline int reqsk_queue_len_young(const struct request_sock_queue *queue) { return atomic_read(&queue->young); } #endif /* _REQUEST_SOCK_H */
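Earlier in this header, reqsk_queue_remove() pops established children from a singly linked FIFO threaded through struct request_sock via dl_next, with rskq_accept_head and rskq_accept_tail kept in struct request_sock_queue. The following stripped-down userspace model shows only that queue discipline; the toy type and function names are hypothetical, and the kernel's enqueue side, rskq_lock locking, and sk_acceptq accounting are deliberately omitted.

#include <stdio.h>

/* Toy stand-ins for struct request_sock / struct request_sock_queue. */
struct toy_req {
	int id;
	struct toy_req *dl_next;	/* link used by the accept FIFO */
};

struct toy_queue {
	struct toy_req *accept_head;
	struct toy_req *accept_tail;
};

static void toy_queue_add(struct toy_queue *q, struct toy_req *req)
{
	req->dl_next = NULL;
	if (!q->accept_head)		/* empty: req becomes head and tail */
		q->accept_head = req;
	else
		q->accept_tail->dl_next = req;
	q->accept_tail = req;
}

static struct toy_req *toy_queue_remove(struct toy_queue *q)
{
	struct toy_req *req = q->accept_head;

	if (req) {
		q->accept_head = req->dl_next;
		if (!q->accept_head)	/* queue drained: clear the tail too */
			q->accept_tail = NULL;
	}
	return req;
}

int main(void)
{
	struct toy_queue q = { NULL, NULL };
	struct toy_req a = { 1, NULL }, b = { 2, NULL };
	struct toy_req *r;

	toy_queue_add(&q, &a);
	toy_queue_add(&q, &b);
	while ((r = toy_queue_remove(&q)))
		printf("accepted request %d\n", r->id);	/* prints 1, then 2 */
	return 0;
}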
// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2008 IBM Corporation * Author: Mimi Zohar <zohar@us.ibm.com> * * ima_policy.c * - initialize default measure policy rules */ #include <linux/init.h> #include <linux/list.h> #include <linux/kernel_read_file.h> #include <linux/fs.h> #include <linux/security.h> #include <linux/magic.h> #include <linux/parser.h> #include <linux/slab.h> #include <linux/rculist.h> #include <linux/genhd.h> #include <linux/seq_file.h> #include <linux/ima.h> #include "ima.h" /* flags definitions */ #define IMA_FUNC 0x0001 #define IMA_MASK 0x0002 #define IMA_FSMAGIC 0x0004 #define IMA_UID 0x0008 #define IMA_FOWNER 0x0010 #define IMA_FSUUID 0x0020 #define IMA_INMASK 0x0040 #define IMA_EUID 0x0080 #define IMA_PCR 0x0100 #define IMA_FSNAME 0x0200 #define IMA_KEYRINGS 0x0400 #define IMA_LABEL 0x0800 #define IMA_VALIDATE_ALGOS 0x1000 #define UNKNOWN 0 #define MEASURE 0x0001 /* same as IMA_MEASURE */ #define DONT_MEASURE 0x0002 #define APPRAISE 0x0004 /* same as IMA_APPRAISE */ #define DONT_APPRAISE 0x0008 #define AUDIT 0x0040 #define HASH 0x0100 #define DONT_HASH 0x0200 #define INVALID_PCR(a) (((a) < 0) || \ (a) >= (sizeof_field(struct 
integrity_iint_cache, measured_pcrs) * 8)) int ima_policy_flag; static int temp_ima_appraise; static int build_ima_appraise __ro_after_init; atomic_t ima_setxattr_allowed_hash_algorithms; #define MAX_LSM_RULES 6 enum lsm_rule_types { LSM_OBJ_USER, LSM_OBJ_ROLE, LSM_OBJ_TYPE, LSM_SUBJ_USER, LSM_SUBJ_ROLE, LSM_SUBJ_TYPE }; enum policy_types { ORIGINAL_TCB = 1, DEFAULT_TCB }; enum policy_rule_list { IMA_DEFAULT_POLICY = 1, IMA_CUSTOM_POLICY }; struct ima_rule_opt_list { size_t count; char *items[]; }; struct ima_rule_entry { struct list_head list; int action; unsigned int flags; enum ima_hooks func; int mask; unsigned long fsmagic; uuid_t fsuuid; kuid_t uid; kuid_t fowner; bool (*uid_op)(kuid_t, kuid_t); /* Handlers for operators */ bool (*fowner_op)(kuid_t, kuid_t); /* uid_eq(), uid_gt(), uid_lt() */ int pcr; unsigned int allowed_algos; /* bitfield of allowed hash algorithms */ struct { void *rule; /* LSM file metadata specific */ char *args_p; /* audit value */ int type; /* audit type */ } lsm[MAX_LSM_RULES]; char *fsname; struct ima_rule_opt_list *keyrings; /* Measure keys added to these keyrings */ struct ima_rule_opt_list *label; /* Measure data grouped under this label */ struct ima_template_desc *template; }; /* * sanity check in case the kernels gains more hash algorithms that can * fit in an unsigned int */ static_assert( 8 * sizeof(unsigned int) >= HASH_ALGO__LAST, "The bitfield allowed_algos in ima_rule_entry is too small to contain all the supported hash algorithms, consider using a bigger type"); /* * Without LSM specific knowledge, the default policy can only be * written in terms of .action, .func, .mask, .fsmagic, .uid, and .fowner */ /* * The minimum rule set to allow for full TCB coverage. Measures all files * opened or mmap for exec and everything read by root. Dangerous because * normal users can easily run the machine out of memory simply building * and running executables. 
*/ static struct ima_rule_entry dont_measure_rules[] __ro_after_init = { {.action = DONT_MEASURE, .fsmagic = PROC_SUPER_MAGIC, .flags = IMA_FSMAGIC}, {.action = DONT_MEASURE, .fsmagic = SYSFS_MAGIC, .flags = IMA_FSMAGIC}, {.action = DONT_MEASURE, .fsmagic = DEBUGFS_MAGIC, .flags = IMA_FSMAGIC}, {.action = DONT_MEASURE, .fsmagic = TMPFS_MAGIC, .flags = IMA_FSMAGIC}, {.action = DONT_MEASURE, .fsmagic = DEVPTS_SUPER_MAGIC, .flags = IMA_FSMAGIC}, {.action = DONT_MEASURE, .fsmagic = BINFMTFS_MAGIC, .flags = IMA_FSMAGIC}, {.action = DONT_MEASURE, .fsmagic = SECURITYFS_MAGIC, .flags = IMA_FSMAGIC}, {.action = DONT_MEASURE, .fsmagic = SELINUX_MAGIC, .flags = IMA_FSMAGIC}, {.action = DONT_MEASURE, .fsmagic = SMACK_MAGIC, .flags = IMA_FSMAGIC}, {.action = DONT_MEASURE, .fsmagic = CGROUP_SUPER_MAGIC, .flags = IMA_FSMAGIC}, {.action = DONT_MEASURE, .fsmagic = CGROUP2_SUPER_MAGIC, .flags = IMA_FSMAGIC}, {.action = DONT_MEASURE, .fsmagic = NSFS_MAGIC, .flags = IMA_FSMAGIC}, {.action = DONT_MEASURE, .fsmagic = EFIVARFS_MAGIC, .flags = IMA_FSMAGIC} }; static struct ima_rule_entry original_measurement_rules[] __ro_after_init = { {.action = MEASURE, .func = MMAP_CHECK, .mask = MAY_EXEC, .flags = IMA_FUNC | IMA_MASK}, {.action = MEASURE, .func = BPRM_CHECK, .mask = MAY_EXEC, .flags = IMA_FUNC | IMA_MASK}, {.action = MEASURE, .func = FILE_CHECK, .mask = MAY_READ, .uid = GLOBAL_ROOT_UID, .uid_op = &uid_eq, .flags = IMA_FUNC | IMA_MASK | IMA_UID}, {.action = MEASURE, .func = MODULE_CHECK, .flags = IMA_FUNC}, {.action = MEASURE, .func = FIRMWARE_CHECK, .flags = IMA_FUNC}, }; static struct ima_rule_entry default_measurement_rules[] __ro_after_init = { {.action = MEASURE, .func = MMAP_CHECK, .mask = MAY_EXEC, .flags = IMA_FUNC | IMA_MASK}, {.action = MEASURE, .func = BPRM_CHECK, .mask = MAY_EXEC, .flags = IMA_FUNC | IMA_MASK}, {.action = MEASURE, .func = FILE_CHECK, .mask = MAY_READ, .uid = GLOBAL_ROOT_UID, .uid_op = &uid_eq, .flags = IMA_FUNC | IMA_INMASK | IMA_EUID}, {.action = MEASURE, .func = FILE_CHECK, .mask = MAY_READ, .uid = GLOBAL_ROOT_UID, .uid_op = &uid_eq, .flags = IMA_FUNC | IMA_INMASK | IMA_UID}, {.action = MEASURE, .func = MODULE_CHECK, .flags = IMA_FUNC}, {.action = MEASURE, .func = FIRMWARE_CHECK, .flags = IMA_FUNC}, {.action = MEASURE, .func = POLICY_CHECK, .flags = IMA_FUNC}, }; static struct ima_rule_entry default_appraise_rules[] __ro_after_init = { {.action = DONT_APPRAISE, .fsmagic = PROC_SUPER_MAGIC, .flags = IMA_FSMAGIC}, {.action = DONT_APPRAISE, .fsmagic = SYSFS_MAGIC, .flags = IMA_FSMAGIC}, {.action = DONT_APPRAISE, .fsmagic = DEBUGFS_MAGIC, .flags = IMA_FSMAGIC}, {.action = DONT_APPRAISE, .fsmagic = TMPFS_MAGIC, .flags = IMA_FSMAGIC}, {.action = DONT_APPRAISE, .fsmagic = RAMFS_MAGIC, .flags = IMA_FSMAGIC}, {.action = DONT_APPRAISE, .fsmagic = DEVPTS_SUPER_MAGIC, .flags = IMA_FSMAGIC}, {.action = DONT_APPRAISE, .fsmagic = BINFMTFS_MAGIC, .flags = IMA_FSMAGIC}, {.action = DONT_APPRAISE, .fsmagic = SECURITYFS_MAGIC, .flags = IMA_FSMAGIC}, {.action = DONT_APPRAISE, .fsmagic = SELINUX_MAGIC, .flags = IMA_FSMAGIC}, {.action = DONT_APPRAISE, .fsmagic = SMACK_MAGIC, .flags = IMA_FSMAGIC}, {.action = DONT_APPRAISE, .fsmagic = NSFS_MAGIC, .flags = IMA_FSMAGIC}, {.action = DONT_APPRAISE, .fsmagic = EFIVARFS_MAGIC, .flags = IMA_FSMAGIC}, {.action = DONT_APPRAISE, .fsmagic = CGROUP_SUPER_MAGIC, .flags = IMA_FSMAGIC}, {.action = DONT_APPRAISE, .fsmagic = CGROUP2_SUPER_MAGIC, .flags = IMA_FSMAGIC}, #ifdef CONFIG_IMA_WRITE_POLICY {.action = APPRAISE, .func = POLICY_CHECK, .flags = IMA_FUNC | 
IMA_DIGSIG_REQUIRED}, #endif #ifndef CONFIG_IMA_APPRAISE_SIGNED_INIT {.action = APPRAISE, .fowner = GLOBAL_ROOT_UID, .fowner_op = &uid_eq, .flags = IMA_FOWNER}, #else /* force signature */ {.action = APPRAISE, .fowner = GLOBAL_ROOT_UID, .fowner_op = &uid_eq, .flags = IMA_FOWNER | IMA_DIGSIG_REQUIRED}, #endif }; static struct ima_rule_entry build_appraise_rules[] __ro_after_init = { #ifdef CONFIG_IMA_APPRAISE_REQUIRE_MODULE_SIGS {.action = APPRAISE, .func = MODULE_CHECK, .flags = IMA_FUNC | IMA_DIGSIG_REQUIRED}, #endif #ifdef CONFIG_IMA_APPRAISE_REQUIRE_FIRMWARE_SIGS {.action = APPRAISE, .func = FIRMWARE_CHECK, .flags = IMA_FUNC | IMA_DIGSIG_REQUIRED}, #endif #ifdef CONFIG_IMA_APPRAISE_REQUIRE_KEXEC_SIGS {.action = APPRAISE, .func = KEXEC_KERNEL_CHECK, .flags = IMA_FUNC | IMA_DIGSIG_REQUIRED}, #endif #ifdef CONFIG_IMA_APPRAISE_REQUIRE_POLICY_SIGS {.action = APPRAISE, .func = POLICY_CHECK, .flags = IMA_FUNC | IMA_DIGSIG_REQUIRED}, #endif }; static struct ima_rule_entry secure_boot_rules[] __ro_after_init = { {.action = APPRAISE, .func = MODULE_CHECK, .flags = IMA_FUNC | IMA_DIGSIG_REQUIRED}, {.action = APPRAISE, .func = FIRMWARE_CHECK, .flags = IMA_FUNC | IMA_DIGSIG_REQUIRED}, {.action = APPRAISE, .func = KEXEC_KERNEL_CHECK, .flags = IMA_FUNC | IMA_DIGSIG_REQUIRED}, {.action = APPRAISE, .func = POLICY_CHECK, .flags = IMA_FUNC | IMA_DIGSIG_REQUIRED}, }; static struct ima_rule_entry critical_data_rules[] __ro_after_init = { {.action = MEASURE, .func = CRITICAL_DATA, .flags = IMA_FUNC}, }; /* An array of architecture specific rules */ static struct ima_rule_entry *arch_policy_entry __ro_after_init; static LIST_HEAD(ima_default_rules); static LIST_HEAD(ima_policy_rules); static LIST_HEAD(ima_temp_rules); static struct list_head __rcu *ima_rules = (struct list_head __rcu *)(&ima_default_rules); static int ima_policy __initdata; static int __init default_measure_policy_setup(char *str) { if (ima_policy) return 1; ima_policy = ORIGINAL_TCB; return 1; } __setup("ima_tcb", default_measure_policy_setup); static bool ima_use_appraise_tcb __initdata; static bool ima_use_secure_boot __initdata; static bool ima_use_critical_data __initdata; static bool ima_fail_unverifiable_sigs __ro_after_init; static int __init policy_setup(char *str) { char *p; while ((p = strsep(&str, " |\n")) != NULL) { if (*p == ' ') continue; if ((strcmp(p, "tcb") == 0) && !ima_policy) ima_policy = DEFAULT_TCB; else if (strcmp(p, "appraise_tcb") == 0) ima_use_appraise_tcb = true; else if (strcmp(p, "secure_boot") == 0) ima_use_secure_boot = true; else if (strcmp(p, "critical_data") == 0) ima_use_critical_data = true; else if (strcmp(p, "fail_securely") == 0) ima_fail_unverifiable_sigs = true; else pr_err("policy \"%s\" not found", p); } return 1; } __setup("ima_policy=", policy_setup); static int __init default_appraise_policy_setup(char *str) { ima_use_appraise_tcb = true; return 1; } __setup("ima_appraise_tcb", default_appraise_policy_setup); static struct ima_rule_opt_list *ima_alloc_rule_opt_list(const substring_t *src) { struct ima_rule_opt_list *opt_list; size_t count = 0; char *src_copy; char *cur, *next; size_t i; src_copy = match_strdup(src); if (!src_copy) return ERR_PTR(-ENOMEM); next = src_copy; while ((cur = strsep(&next, "|"))) { /* Don't accept an empty list item */ if (!(*cur)) { kfree(src_copy); return ERR_PTR(-EINVAL); } count++; } /* Don't accept an empty list */ if (!count) { kfree(src_copy); return ERR_PTR(-EINVAL); } opt_list = kzalloc(struct_size(opt_list, items, count), GFP_KERNEL); if (!opt_list) { 
kfree(src_copy); return ERR_PTR(-ENOMEM); } /* * strsep() has already replaced all instances of '|' with '\0', * leaving a byte sequence of NUL-terminated strings. Reference each * string with the array of items. * * IMPORTANT: Ownership of the allocated buffer is transferred from * src_copy to the first element in the items array. To free the * buffer, kfree() must only be called on the first element of the * array. */ for (i = 0, cur = src_copy; i < count; i++) { opt_list->items[i] = cur; cur = strchr(cur, '\0') + 1; } opt_list->count = count; return opt_list; } static void ima_free_rule_opt_list(struct ima_rule_opt_list *opt_list) { if (!opt_list) return; if (opt_list->count) { kfree(opt_list->items[0]); opt_list->count = 0; } kfree(opt_list); } static void ima_lsm_free_rule(struct ima_rule_entry *entry) { int i; for (i = 0; i < MAX_LSM_RULES; i++) { ima_filter_rule_free(entry->lsm[i].rule); kfree(entry->lsm[i].args_p); } } static void ima_free_rule(struct ima_rule_entry *entry) { if (!entry) return; /* * entry->template->fields may be allocated in ima_parse_rule() but that * reference is owned by the corresponding ima_template_desc element in * the defined_templates list and cannot be freed here */ kfree(entry->fsname); ima_free_rule_opt_list(entry->keyrings); ima_lsm_free_rule(entry); kfree(entry); } static struct ima_rule_entry *ima_lsm_copy_rule(struct ima_rule_entry *entry, gfp_t gfp) { struct ima_rule_entry *nentry; int i; /* * Immutable elements are copied over as pointers and data; only * lsm rules can change */ nentry = kmemdup(entry, sizeof(*nentry), gfp); if (!nentry) return NULL; memset(nentry->lsm, 0, sizeof_field(struct ima_rule_entry, lsm)); for (i = 0; i < MAX_LSM_RULES; i++) { if (!entry->lsm[i].args_p) continue; nentry->lsm[i].type = entry->lsm[i].type; nentry->lsm[i].args_p = entry->lsm[i].args_p; ima_filter_rule_init(nentry->lsm[i].type, Audit_equal, nentry->lsm[i].args_p, &nentry->lsm[i].rule, gfp); if (!nentry->lsm[i].rule) pr_warn("rule for LSM \'%s\' is undefined\n", nentry->lsm[i].args_p); } return nentry; } static int ima_lsm_update_rule(struct ima_rule_entry *entry) { int i; struct ima_rule_entry *nentry; nentry = ima_lsm_copy_rule(entry, GFP_KERNEL); if (!nentry) return -ENOMEM; list_replace_rcu(&entry->list, &nentry->list); synchronize_rcu(); /* * ima_lsm_copy_rule() shallow copied all references, except for the * LSM references, from entry to nentry so we only want to free the LSM * references and the entry itself. All other memory refrences will now * be owned by nentry. */ for (i = 0; i < MAX_LSM_RULES; i++) ima_filter_rule_free(entry->lsm[i].rule); kfree(entry); return 0; } static bool ima_rule_contains_lsm_cond(struct ima_rule_entry *entry) { int i; for (i = 0; i < MAX_LSM_RULES; i++) if (entry->lsm[i].args_p) return true; return false; } /* * The LSM policy can be reloaded, leaving the IMA LSM based rules referring * to the old, stale LSM policy. Update the IMA LSM based rules to reflect * the reloaded LSM policy. 
*/ static void ima_lsm_update_rules(void) { struct ima_rule_entry *entry, *e; int result; list_for_each_entry_safe(entry, e, &ima_policy_rules, list) { if (!ima_rule_contains_lsm_cond(entry)) continue; result = ima_lsm_update_rule(entry); if (result) { pr_err("lsm rule update error %d\n", result); return; } } } int ima_lsm_policy_change(struct notifier_block *nb, unsigned long event, void *lsm_data) { if (event != LSM_POLICY_CHANGE) return NOTIFY_DONE; ima_lsm_update_rules(); return NOTIFY_OK; } /** * ima_match_rule_data - determine whether func_data matches the policy rule * @rule: a pointer to a rule * @func_data: data to match against the measure rule data * @cred: a pointer to a credentials structure for user validation * * Returns true if func_data matches one in the rule, false otherwise. */ static bool ima_match_rule_data(struct ima_rule_entry *rule, const char *func_data, const struct cred *cred) { const struct ima_rule_opt_list *opt_list = NULL; bool matched = false; size_t i; if ((rule->flags & IMA_UID) && !rule->uid_op(cred->uid, rule->uid)) return false; switch (rule->func) { case KEY_CHECK: if (!rule->keyrings) return true; opt_list = rule->keyrings; break; case CRITICAL_DATA: if (!rule->label) return true; opt_list = rule->label; break; default: return false; } if (!func_data) return false; for (i = 0; i < opt_list->count; i++) { if (!strcmp(opt_list->items[i], func_data)) { matched = true; break; } } return matched; } /** * ima_match_rules - determine whether an inode matches the policy rule. * @rule: a pointer to a rule * @mnt_userns: user namespace of the mount the inode was found from * @inode: a pointer to an inode * @cred: a pointer to a credentials structure for user validation * @secid: the secid of the task to be validated * @func: LIM hook identifier * @mask: requested action (MAY_READ | MAY_WRITE | MAY_APPEND | MAY_EXEC) * @func_data: func specific data, may be NULL * * Returns true on rule match, false on failure. 
*/ static bool ima_match_rules(struct ima_rule_entry *rule, struct user_namespace *mnt_userns, struct inode *inode, const struct cred *cred, u32 secid, enum ima_hooks func, int mask, const char *func_data) { int i; bool result = false; struct ima_rule_entry *lsm_rule = rule; bool rule_reinitialized = false; if ((rule->flags & IMA_FUNC) && (rule->func != func && func != POST_SETATTR)) return false; switch (func) { case KEY_CHECK: case CRITICAL_DATA: return ((rule->func == func) && ima_match_rule_data(rule, func_data, cred)); default: break; } if ((rule->flags & IMA_MASK) && (rule->mask != mask && func != POST_SETATTR)) return false; if ((rule->flags & IMA_INMASK) && (!(rule->mask & mask) && func != POST_SETATTR)) return false; if ((rule->flags & IMA_FSMAGIC) && rule->fsmagic != inode->i_sb->s_magic) return false; if ((rule->flags & IMA_FSNAME) && strcmp(rule->fsname, inode->i_sb->s_type->name)) return false; if ((rule->flags & IMA_FSUUID) && !uuid_equal(&rule->fsuuid, &inode->i_sb->s_uuid)) return false; if ((rule->flags & IMA_UID) && !rule->uid_op(cred->uid, rule->uid)) return false; if (rule->flags & IMA_EUID) { if (has_capability_noaudit(current, CAP_SETUID)) { if (!rule->uid_op(cred->euid, rule->uid) && !rule->uid_op(cred->suid, rule->uid) && !rule->uid_op(cred->uid, rule->uid)) return false; } else if (!rule->uid_op(cred->euid, rule->uid)) return false; } if ((rule->flags & IMA_FOWNER) && !rule->fowner_op(i_uid_into_mnt(mnt_userns, inode), rule->fowner)) return false; for (i = 0; i < MAX_LSM_RULES; i++) { int rc = 0; u32 osid; if (!lsm_rule->lsm[i].rule) { if (!lsm_rule->lsm[i].args_p) continue; else return false; } retry: switch (i) { case LSM_OBJ_USER: case LSM_OBJ_ROLE: case LSM_OBJ_TYPE: security_inode_getsecid(inode, &osid); rc = ima_filter_rule_match(osid, lsm_rule->lsm[i].type, Audit_equal, lsm_rule->lsm[i].rule); break; case LSM_SUBJ_USER: case LSM_SUBJ_ROLE: case LSM_SUBJ_TYPE: rc = ima_filter_rule_match(secid, lsm_rule->lsm[i].type, Audit_equal, lsm_rule->lsm[i].rule); break; default: break; } if (rc == -ESTALE && !rule_reinitialized) { lsm_rule = ima_lsm_copy_rule(rule, GFP_ATOMIC); if (lsm_rule) { rule_reinitialized = true; goto retry; } } if (!rc) { result = false; goto out; } } result = true; out: if (rule_reinitialized) { for (i = 0; i < MAX_LSM_RULES; i++) ima_filter_rule_free(lsm_rule->lsm[i].rule); kfree(lsm_rule); } return result; } /* * In addition to knowing that we need to appraise the file in general, * we need to differentiate between calling hooks, for hook specific rules. */ static int get_subaction(struct ima_rule_entry *rule, enum ima_hooks func) { if (!(rule->flags & IMA_FUNC)) return IMA_FILE_APPRAISE; switch (func) { case MMAP_CHECK: return IMA_MMAP_APPRAISE; case BPRM_CHECK: return IMA_BPRM_APPRAISE; case CREDS_CHECK: return IMA_CREDS_APPRAISE; case FILE_CHECK: case POST_SETATTR: return IMA_FILE_APPRAISE; case MODULE_CHECK ... MAX_CHECK - 1: default: return IMA_READ_APPRAISE; } } /** * ima_match_policy - decision based on LSM and other conditions * @mnt_userns: user namespace of the mount the inode was found from * @inode: pointer to an inode for which the policy decision is being made * @cred: pointer to a credentials structure for which the policy decision is * being made * @secid: LSM secid of the task to be validated * @func: IMA hook identifier * @mask: requested action (MAY_READ | MAY_WRITE | MAY_APPEND | MAY_EXEC) * @flags: IMA actions to consider (e.g. 
IMA_MEASURE | IMA_APPRAISE) * @pcr: set the pcr to extend * @template_desc: the template that should be used for this rule * @func_data: func specific data, may be NULL * @allowed_algos: allowlist of hash algorithms for the IMA xattr * * Measure decision based on func/mask/fsmagic and LSM(subj/obj/type) * conditions. * * Since the IMA policy may be updated multiple times we need to lock the * list when walking it. Reads are many orders of magnitude more numerous * than writes so ima_match_policy() is classical RCU candidate. */ int ima_match_policy(struct user_namespace *mnt_userns, struct inode *inode, const struct cred *cred, u32 secid, enum ima_hooks func, int mask, int flags, int *pcr, struct ima_template_desc **template_desc, const char *func_data, unsigned int *allowed_algos) { struct ima_rule_entry *entry; int action = 0, actmask = flags | (flags << 1); struct list_head *ima_rules_tmp; if (template_desc && !*template_desc) *template_desc = ima_template_desc_current(); rcu_read_lock(); ima_rules_tmp = rcu_dereference(ima_rules); list_for_each_entry_rcu(entry, ima_rules_tmp, list) { if (!(entry->action & actmask)) continue; if (!ima_match_rules(entry, mnt_userns, inode, cred, secid, func, mask, func_data)) continue; action |= entry->flags & IMA_ACTION_FLAGS; action |= entry->action & IMA_DO_MASK; if (entry->action & IMA_APPRAISE) { action |= get_subaction(entry, func); action &= ~IMA_HASH; if (ima_fail_unverifiable_sigs) action |= IMA_FAIL_UNVERIFIABLE_SIGS; if (allowed_algos && entry->flags & IMA_VALIDATE_ALGOS) *allowed_algos = entry->allowed_algos; } if (entry->action & IMA_DO_MASK) actmask &= ~(entry->action | entry->action << 1); else actmask &= ~(entry->action | entry->action >> 1); if ((pcr) && (entry->flags & IMA_PCR)) *pcr = entry->pcr; if (template_desc && entry->template) *template_desc = entry->template; if (!actmask) break; } rcu_read_unlock(); return action; } /** * ima_update_policy_flags() - Update global IMA variables * * Update ima_policy_flag and ima_setxattr_allowed_hash_algorithms * based on the currently loaded policy. * * With ima_policy_flag, the decision to short circuit out of a function * or not call the function in the first place can be made earlier. * * With ima_setxattr_allowed_hash_algorithms, the policy can restrict the * set of hash algorithms accepted when updating the security.ima xattr of * a file. * * Context: called after a policy update and at system initialization. */ void ima_update_policy_flags(void) { struct ima_rule_entry *entry; int new_policy_flag = 0; struct list_head *ima_rules_tmp; rcu_read_lock(); ima_rules_tmp = rcu_dereference(ima_rules); list_for_each_entry_rcu(entry, ima_rules_tmp, list) { /* * SETXATTR_CHECK rules do not implement a full policy check * because rule checking would probably have an important * performance impact on setxattr(). As a consequence, only one * SETXATTR_CHECK can be active at a given time. * Because we want to preserve that property, we set out to use * atomic_cmpxchg. 
Either: * - the atomic was non-zero: a setxattr hash policy is * already enforced, we do nothing * - the atomic was zero: no setxattr policy was set, enable * the setxattr hash policy */ if (entry->func == SETXATTR_CHECK) { atomic_cmpxchg(&ima_setxattr_allowed_hash_algorithms, 0, entry->allowed_algos); /* SETXATTR_CHECK doesn't impact ima_policy_flag */ continue; } if (entry->action & IMA_DO_MASK) new_policy_flag |= entry->action; } rcu_read_unlock(); ima_appraise |= (build_ima_appraise | temp_ima_appraise); if (!ima_appraise) new_policy_flag &= ~IMA_APPRAISE; ima_policy_flag = new_policy_flag; } static int ima_appraise_flag(enum ima_hooks func) { if (func == MODULE_CHECK) return IMA_APPRAISE_MODULES; else if (func == FIRMWARE_CHECK) return IMA_APPRAISE_FIRMWARE; else if (func == POLICY_CHECK) return IMA_APPRAISE_POLICY; else if (func == KEXEC_KERNEL_CHECK) return IMA_APPRAISE_KEXEC; return 0; } static void add_rules(struct ima_rule_entry *entries, int count, enum policy_rule_list policy_rule) { int i = 0; for (i = 0; i < count; i++) { struct ima_rule_entry *entry; if (policy_rule & IMA_DEFAULT_POLICY) list_add_tail(&entries[i].list, &ima_default_rules); if (policy_rule & IMA_CUSTOM_POLICY) { entry = kmemdup(&entries[i], sizeof(*entry), GFP_KERNEL); if (!entry) continue; list_add_tail(&entry->list, &ima_policy_rules); } if (entries[i].action == APPRAISE) { if (entries != build_appraise_rules) temp_ima_appraise |= ima_appraise_flag(entries[i].func); else build_ima_appraise |= ima_appraise_flag(entries[i].func); } } } static int ima_parse_rule(char *rule, struct ima_rule_entry *entry); static int __init ima_init_arch_policy(void) { const char * const *arch_rules; const char * const *rules; int arch_entries = 0; int i = 0; arch_rules = arch_get_ima_policy(); if (!arch_rules) return arch_entries; /* Get number of rules */ for (rules = arch_rules; *rules != NULL; rules++) arch_entries++; arch_policy_entry = kcalloc(arch_entries + 1, sizeof(*arch_policy_entry), GFP_KERNEL); if (!arch_policy_entry) return 0; /* Convert each policy string rules to struct ima_rule_entry format */ for (rules = arch_rules, i = 0; *rules != NULL; rules++) { char rule[255]; int result; result = strlcpy(rule, *rules, sizeof(rule)); INIT_LIST_HEAD(&arch_policy_entry[i].list); result = ima_parse_rule(rule, &arch_policy_entry[i]); if (result) { pr_warn("Skipping unknown architecture policy rule: %s\n", rule); memset(&arch_policy_entry[i], 0, sizeof(*arch_policy_entry)); continue; } i++; } return i; } /** * ima_init_policy - initialize the default measure rules. * * ima_rules points to either the ima_default_rules or the * the new ima_policy_rules. */ void __init ima_init_policy(void) { int build_appraise_entries, arch_entries; /* if !ima_policy, we load NO default rules */ if (ima_policy) add_rules(dont_measure_rules, ARRAY_SIZE(dont_measure_rules), IMA_DEFAULT_POLICY); switch (ima_policy) { case ORIGINAL_TCB: add_rules(original_measurement_rules, ARRAY_SIZE(original_measurement_rules), IMA_DEFAULT_POLICY); break; case DEFAULT_TCB: add_rules(default_measurement_rules, ARRAY_SIZE(default_measurement_rules), IMA_DEFAULT_POLICY); break; default: break; } /* * Based on runtime secure boot flags, insert arch specific measurement * and appraise rules requiring file signatures for both the initial * and custom policies, prior to other appraise rules. 
* (Highest priority) */ arch_entries = ima_init_arch_policy(); if (!arch_entries) pr_info("No architecture policies found\n"); else add_rules(arch_policy_entry, arch_entries, IMA_DEFAULT_POLICY | IMA_CUSTOM_POLICY); /* * Insert the builtin "secure_boot" policy rules requiring file * signatures, prior to other appraise rules. */ if (ima_use_secure_boot) add_rules(secure_boot_rules, ARRAY_SIZE(secure_boot_rules), IMA_DEFAULT_POLICY); /* * Insert the build time appraise rules requiring file signatures * for both the initial and custom policies, prior to other appraise * rules. As the secure boot rules includes all of the build time * rules, include either one or the other set of rules, but not both. */ build_appraise_entries = ARRAY_SIZE(build_appraise_rules); if (build_appraise_entries) { if (ima_use_secure_boot) add_rules(build_appraise_rules, build_appraise_entries, IMA_CUSTOM_POLICY); else add_rules(build_appraise_rules, build_appraise_entries, IMA_DEFAULT_POLICY | IMA_CUSTOM_POLICY); } if (ima_use_appraise_tcb) add_rules(default_appraise_rules, ARRAY_SIZE(default_appraise_rules), IMA_DEFAULT_POLICY); if (ima_use_critical_data) add_rules(critical_data_rules, ARRAY_SIZE(critical_data_rules), IMA_DEFAULT_POLICY); atomic_set(&ima_setxattr_allowed_hash_algorithms, 0); ima_update_policy_flags(); } /* Make sure we have a valid policy, at least containing some rules. */ int ima_check_policy(void) { if (list_empty(&ima_temp_rules)) return -EINVAL; return 0; } /** * ima_update_policy - update default_rules with new measure rules * * Called on file .release to update the default rules with a complete new * policy. What we do here is to splice ima_policy_rules and ima_temp_rules so * they make a queue. The policy may be updated multiple times and this is the * RCU updater. * * Policy rules are never deleted so ima_policy_flag gets zeroed only once when * we switch from the default policy to user defined. */ void ima_update_policy(void) { struct list_head *policy = &ima_policy_rules; list_splice_tail_init_rcu(&ima_temp_rules, policy, synchronize_rcu); if (ima_rules != (struct list_head __rcu *)policy) { ima_policy_flag = 0; rcu_assign_pointer(ima_rules, policy); /* * IMA architecture specific policy rules are specified * as strings and converted to an array of ima_entry_rules * on boot. After loading a custom policy, free the * architecture specific rules stored as an array. */ kfree(arch_policy_entry); } ima_update_policy_flags(); /* Custom IMA policy has been loaded */ ima_process_queued_keys(); } /* Keep the enumeration in sync with the policy_tokens! 
*/ enum { Opt_measure, Opt_dont_measure, Opt_appraise, Opt_dont_appraise, Opt_audit, Opt_hash, Opt_dont_hash, Opt_obj_user, Opt_obj_role, Opt_obj_type, Opt_subj_user, Opt_subj_role, Opt_subj_type, Opt_func, Opt_mask, Opt_fsmagic, Opt_fsname, Opt_fsuuid, Opt_uid_eq, Opt_euid_eq, Opt_fowner_eq, Opt_uid_gt, Opt_euid_gt, Opt_fowner_gt, Opt_uid_lt, Opt_euid_lt, Opt_fowner_lt, Opt_appraise_type, Opt_appraise_flag, Opt_appraise_algos, Opt_permit_directio, Opt_pcr, Opt_template, Opt_keyrings, Opt_label, Opt_err }; static const match_table_t policy_tokens = { {Opt_measure, "measure"}, {Opt_dont_measure, "dont_measure"}, {Opt_appraise, "appraise"}, {Opt_dont_appraise, "dont_appraise"}, {Opt_audit, "audit"}, {Opt_hash, "hash"}, {Opt_dont_hash, "dont_hash"}, {Opt_obj_user, "obj_user=%s"}, {Opt_obj_role, "obj_role=%s"}, {Opt_obj_type, "obj_type=%s"}, {Opt_subj_user, "subj_user=%s"}, {Opt_subj_role, "subj_role=%s"}, {Opt_subj_type, "subj_type=%s"}, {Opt_func, "func=%s"}, {Opt_mask, "mask=%s"}, {Opt_fsmagic, "fsmagic=%s"}, {Opt_fsname, "fsname=%s"}, {Opt_fsuuid, "fsuuid=%s"}, {Opt_uid_eq, "uid=%s"}, {Opt_euid_eq, "euid=%s"}, {Opt_fowner_eq, "fowner=%s"}, {Opt_uid_gt, "uid>%s"}, {Opt_euid_gt, "euid>%s"}, {Opt_fowner_gt, "fowner>%s"}, {Opt_uid_lt, "uid<%s"}, {Opt_euid_lt, "euid<%s"}, {Opt_fowner_lt, "fowner<%s"}, {Opt_appraise_type, "appraise_type=%s"}, {Opt_appraise_flag, "appraise_flag=%s"}, {Opt_appraise_algos, "appraise_algos=%s"}, {Opt_permit_directio, "permit_directio"}, {Opt_pcr, "pcr=%s"}, {Opt_template, "template=%s"}, {Opt_keyrings, "keyrings=%s"}, {Opt_label, "label=%s"}, {Opt_err, NULL} }; static int ima_lsm_rule_init(struct ima_rule_entry *entry, substring_t *args, int lsm_rule, int audit_type) { int result; if (entry->lsm[lsm_rule].rule) return -EINVAL; entry->lsm[lsm_rule].args_p = match_strdup(args); if (!entry->lsm[lsm_rule].args_p) return -ENOMEM; entry->lsm[lsm_rule].type = audit_type; result = ima_filter_rule_init(entry->lsm[lsm_rule].type, Audit_equal, entry->lsm[lsm_rule].args_p, &entry->lsm[lsm_rule].rule, GFP_KERNEL); if (!entry->lsm[lsm_rule].rule) { pr_warn("rule for LSM \'%s\' is undefined\n", entry->lsm[lsm_rule].args_p); if (ima_rules == (struct list_head __rcu *)(&ima_default_rules)) { kfree(entry->lsm[lsm_rule].args_p); entry->lsm[lsm_rule].args_p = NULL; result = -EINVAL; } else result = 0; } return result; } static void ima_log_string_op(struct audit_buffer *ab, char *key, char *value, bool (*rule_operator)(kuid_t, kuid_t)) { if (!ab) return; if (rule_operator == &uid_gt) audit_log_format(ab, "%s>", key); else if (rule_operator == &uid_lt) audit_log_format(ab, "%s<", key); else audit_log_format(ab, "%s=", key); audit_log_format(ab, "%s ", value); } static void ima_log_string(struct audit_buffer *ab, char *key, char *value) { ima_log_string_op(ab, key, value, NULL); } /* * Validating the appended signature included in the measurement list requires * the file hash calculated without the appended signature (i.e., the 'd-modsig' * field). Therefore, notify the user if they have the 'modsig' field but not * the 'd-modsig' field in the template. */ static void check_template_modsig(const struct ima_template_desc *template) { #define MSG "template with 'modsig' field also needs 'd-modsig' field\n" bool has_modsig, has_dmodsig; static bool checked; int i; /* We only need to notify the user once. 
*/ if (checked) return; has_modsig = has_dmodsig = false; for (i = 0; i < template->num_fields; i++) { if (!strcmp(template->fields[i]->field_id, "modsig")) has_modsig = true; else if (!strcmp(template->fields[i]->field_id, "d-modsig")) has_dmodsig = true; } if (has_modsig && !has_dmodsig) pr_notice(MSG); checked = true; #undef MSG } static bool ima_validate_rule(struct ima_rule_entry *entry) { /* Ensure that the action is set and is compatible with the flags */ if (entry->action == UNKNOWN) return false; if (entry->action != MEASURE && entry->flags & IMA_PCR) return false; if (entry->action != APPRAISE && entry->flags & (IMA_DIGSIG_REQUIRED | IMA_MODSIG_ALLOWED | IMA_CHECK_BLACKLIST | IMA_VALIDATE_ALGOS)) return false; /* * The IMA_FUNC bit must be set if and only if there's a valid hook * function specified, and vice versa. Enforcing this property allows * for the NONE case below to validate a rule without an explicit hook * function. */ if (((entry->flags & IMA_FUNC) && entry->func == NONE) || (!(entry->flags & IMA_FUNC) && entry->func != NONE)) return false; /* * Ensure that the hook function is compatible with the other * components of the rule */ switch (entry->func) { case NONE: case FILE_CHECK: case MMAP_CHECK: case BPRM_CHECK: case CREDS_CHECK: case POST_SETATTR: case FIRMWARE_CHECK: case POLICY_CHECK: if (entry->flags & ~(IMA_FUNC | IMA_MASK | IMA_FSMAGIC | IMA_UID | IMA_FOWNER | IMA_FSUUID | IMA_INMASK | IMA_EUID | IMA_PCR | IMA_FSNAME | IMA_DIGSIG_REQUIRED | IMA_PERMIT_DIRECTIO | IMA_VALIDATE_ALGOS)) return false; break; case MODULE_CHECK: case KEXEC_KERNEL_CHECK: case KEXEC_INITRAMFS_CHECK: if (entry->flags & ~(IMA_FUNC | IMA_MASK | IMA_FSMAGIC | IMA_UID | IMA_FOWNER | IMA_FSUUID | IMA_INMASK | IMA_EUID | IMA_PCR | IMA_FSNAME | IMA_DIGSIG_REQUIRED | IMA_PERMIT_DIRECTIO | IMA_MODSIG_ALLOWED | IMA_CHECK_BLACKLIST | IMA_VALIDATE_ALGOS)) return false; break; case KEXEC_CMDLINE: if (entry->action & ~(MEASURE | DONT_MEASURE)) return false; if (entry->flags & ~(IMA_FUNC | IMA_FSMAGIC | IMA_UID | IMA_FOWNER | IMA_FSUUID | IMA_EUID | IMA_PCR | IMA_FSNAME)) return false; break; case KEY_CHECK: if (entry->action & ~(MEASURE | DONT_MEASURE)) return false; if (entry->flags & ~(IMA_FUNC | IMA_UID | IMA_PCR | IMA_KEYRINGS)) return false; if (ima_rule_contains_lsm_cond(entry)) return false; break; case CRITICAL_DATA: if (entry->action & ~(MEASURE | DONT_MEASURE)) return false; if (entry->flags & ~(IMA_FUNC | IMA_UID | IMA_PCR | IMA_LABEL)) return false; if (ima_rule_contains_lsm_cond(entry)) return false; break; case SETXATTR_CHECK: /* any action other than APPRAISE is unsupported */ if (entry->action != APPRAISE) return false; /* SETXATTR_CHECK requires an appraise_algos parameter */ if (!(entry->flags & IMA_VALIDATE_ALGOS)) return false; /* * full policies are not supported, they would have too * much of a performance impact */ if (entry->flags & ~(IMA_FUNC | IMA_VALIDATE_ALGOS)) return false; break; default: return false; } /* Ensure that combinations of flags are compatible with each other */ if (entry->flags & IMA_CHECK_BLACKLIST && !(entry->flags & IMA_MODSIG_ALLOWED)) return false; return true; } static unsigned int ima_parse_appraise_algos(char *arg) { unsigned int res = 0; int idx; char *token; while ((token = strsep(&arg, ",")) != NULL) { idx = match_string(hash_algo_name, HASH_ALGO__LAST, token); if (idx < 0) { pr_err("unknown hash algorithm \"%s\"", token); return 0; } if (!crypto_has_alg(hash_algo_name[idx], 0, 0)) { pr_err("unavailable hash algorithm \"%s\", check your kernel 
configuration", token); return 0; } /* Add the hash algorithm to the 'allowed' bitfield */ res |= (1U << idx); } return res; } static int ima_parse_rule(char *rule, struct ima_rule_entry *entry) { struct audit_buffer *ab; char *from; char *p; bool uid_token; struct ima_template_desc *template_desc; int result = 0; ab = integrity_audit_log_start(audit_context(), GFP_KERNEL, AUDIT_INTEGRITY_POLICY_RULE); entry->uid = INVALID_UID; entry->fowner = INVALID_UID; entry->uid_op = &uid_eq; entry->fowner_op = &uid_eq; entry->action = UNKNOWN; while ((p = strsep(&rule, " \t")) != NULL) { substring_t args[MAX_OPT_ARGS]; int token; unsigned long lnum; if (result < 0) break; if ((*p == '\0') || (*p == ' ') || (*p == '\t')) continue; token = match_token(p, policy_tokens, args); switch (token) { case Opt_measure: ima_log_string(ab, "action", "measure"); if (entry->action != UNKNOWN) result = -EINVAL; entry->action = MEASURE; break; case Opt_dont_measure: ima_log_string(ab, "action", "dont_measure"); if (entry->action != UNKNOWN) result = -EINVAL; entry->action = DONT_MEASURE; break; case Opt_appraise: ima_log_string(ab, "action", "appraise"); if (entry->action != UNKNOWN) result = -EINVAL; entry->action = APPRAISE; break; case Opt_dont_appraise: ima_log_string(ab, "action", "dont_appraise"); if (entry->action != UNKNOWN) result = -EINVAL; entry->action = DONT_APPRAISE; break; case Opt_audit: ima_log_string(ab, "action", "audit"); if (entry->action != UNKNOWN) result = -EINVAL; entry->action = AUDIT; break; case Opt_hash: ima_log_string(ab, "action", "hash"); if (entry->action != UNKNOWN) result = -EINVAL; entry->action = HASH; break; case Opt_dont_hash: ima_log_string(ab, "action", "dont_hash"); if (entry->action != UNKNOWN) result = -EINVAL; entry->action = DONT_HASH; break; case Opt_func: ima_log_string(ab, "func", args[0].from); if (entry->func) result = -EINVAL; if (strcmp(args[0].from, "FILE_CHECK") == 0) entry->func = FILE_CHECK; /* PATH_CHECK is for backwards compat */ else if (strcmp(args[0].from, "PATH_CHECK") == 0) entry->func = FILE_CHECK; else if (strcmp(args[0].from, "MODULE_CHECK") == 0) entry->func = MODULE_CHECK; else if (strcmp(args[0].from, "FIRMWARE_CHECK") == 0) entry->func = FIRMWARE_CHECK; else if ((strcmp(args[0].from, "FILE_MMAP") == 0) || (strcmp(args[0].from, "MMAP_CHECK") == 0)) entry->func = MMAP_CHECK; else if (strcmp(args[0].from, "BPRM_CHECK") == 0) entry->func = BPRM_CHECK; else if (strcmp(args[0].from, "CREDS_CHECK") == 0) entry->func = CREDS_CHECK; else if (strcmp(args[0].from, "KEXEC_KERNEL_CHECK") == 0) entry->func = KEXEC_KERNEL_CHECK; else if (strcmp(args[0].from, "KEXEC_INITRAMFS_CHECK") == 0) entry->func = KEXEC_INITRAMFS_CHECK; else if (strcmp(args[0].from, "POLICY_CHECK") == 0) entry->func = POLICY_CHECK; else if (strcmp(args[0].from, "KEXEC_CMDLINE") == 0) entry->func = KEXEC_CMDLINE; else if (IS_ENABLED(CONFIG_IMA_MEASURE_ASYMMETRIC_KEYS) && strcmp(args[0].from, "KEY_CHECK") == 0) entry->func = KEY_CHECK; else if (strcmp(args[0].from, "CRITICAL_DATA") == 0) entry->func = CRITICAL_DATA; else if (strcmp(args[0].from, "SETXATTR_CHECK") == 0) entry->func = SETXATTR_CHECK; else result = -EINVAL; if (!result) entry->flags |= IMA_FUNC; break; case Opt_mask: ima_log_string(ab, "mask", args[0].from); if (entry->mask) result = -EINVAL; from = args[0].from; if (*from == '^') from++; if ((strcmp(from, "MAY_EXEC")) == 0) entry->mask = MAY_EXEC; else if (strcmp(from, "MAY_WRITE") == 0) entry->mask = MAY_WRITE; else if (strcmp(from, "MAY_READ") == 0) entry->mask = MAY_READ; 
else if (strcmp(from, "MAY_APPEND") == 0) entry->mask = MAY_APPEND; else result = -EINVAL; if (!result) entry->flags |= (*args[0].from == '^') ? IMA_INMASK : IMA_MASK; break; case Opt_fsmagic: ima_log_string(ab, "fsmagic", args[0].from); if (entry->fsmagic) { result = -EINVAL; break; } result = kstrtoul(args[0].from, 16, &entry->fsmagic); if (!result) entry->flags |= IMA_FSMAGIC; break; case Opt_fsname: ima_log_string(ab, "fsname", args[0].from); entry->fsname = kstrdup(args[0].from, GFP_KERNEL); if (!entry->fsname) { result = -ENOMEM; break; } result = 0; entry->flags |= IMA_FSNAME; break; case Opt_keyrings: ima_log_string(ab, "keyrings", args[0].from); if (!IS_ENABLED(CONFIG_IMA_MEASURE_ASYMMETRIC_KEYS) || entry->keyrings) { result = -EINVAL; break; } entry->keyrings = ima_alloc_rule_opt_list(args); if (IS_ERR(entry->keyrings)) { result = PTR_ERR(entry->keyrings); entry->keyrings = NULL; break; } entry->flags |= IMA_KEYRINGS; break; case Opt_label: ima_log_string(ab, "label", args[0].from); if (entry->label) { result = -EINVAL; break; } entry->label = ima_alloc_rule_opt_list(args); if (IS_ERR(entry->label)) { result = PTR_ERR(entry->label); entry->label = NULL; break; } entry->flags |= IMA_LABEL; break; case Opt_fsuuid: ima_log_string(ab, "fsuuid", args[0].from); if (!uuid_is_null(&entry->fsuuid)) { result = -EINVAL; break; } result = uuid_parse(args[0].from, &entry->fsuuid); if (!result) entry->flags |= IMA_FSUUID; break; case Opt_uid_gt: case Opt_euid_gt: entry->uid_op = &uid_gt; fallthrough; case Opt_uid_lt: case Opt_euid_lt: if ((token == Opt_uid_lt) || (token == Opt_euid_lt)) entry->uid_op = &uid_lt; fallthrough; case Opt_uid_eq: case Opt_euid_eq: uid_token = (token == Opt_uid_eq) || (token == Opt_uid_gt) || (token == Opt_uid_lt); ima_log_string_op(ab, uid_token ? "uid" : "euid", args[0].from, entry->uid_op); if (uid_valid(entry->uid)) { result = -EINVAL; break; } result = kstrtoul(args[0].from, 10, &lnum); if (!result) { entry->uid = make_kuid(current_user_ns(), (uid_t) lnum); if (!uid_valid(entry->uid) || (uid_t)lnum != lnum) result = -EINVAL; else entry->flags |= uid_token ? 
IMA_UID : IMA_EUID; } break; case Opt_fowner_gt: entry->fowner_op = &uid_gt; fallthrough; case Opt_fowner_lt: if (token == Opt_fowner_lt) entry->fowner_op = &uid_lt; fallthrough; case Opt_fowner_eq: ima_log_string_op(ab, "fowner", args[0].from, entry->fowner_op); if (uid_valid(entry->fowner)) { result = -EINVAL; break; } result = kstrtoul(args[0].from, 10, &lnum); if (!result) { entry->fowner = make_kuid(current_user_ns(), (uid_t)lnum); if (!uid_valid(entry->fowner) || (((uid_t)lnum) != lnum)) result = -EINVAL; else entry->flags |= IMA_FOWNER; } break; case Opt_obj_user: ima_log_string(ab, "obj_user", args[0].from); result = ima_lsm_rule_init(entry, args, LSM_OBJ_USER, AUDIT_OBJ_USER); break; case Opt_obj_role: ima_log_string(ab, "obj_role", args[0].from); result = ima_lsm_rule_init(entry, args, LSM_OBJ_ROLE, AUDIT_OBJ_ROLE); break; case Opt_obj_type: ima_log_string(ab, "obj_type", args[0].from); result = ima_lsm_rule_init(entry, args, LSM_OBJ_TYPE, AUDIT_OBJ_TYPE); break; case Opt_subj_user: ima_log_string(ab, "subj_user", args[0].from); result = ima_lsm_rule_init(entry, args, LSM_SUBJ_USER, AUDIT_SUBJ_USER); break; case Opt_subj_role: ima_log_string(ab, "subj_role", args[0].from); result = ima_lsm_rule_init(entry, args, LSM_SUBJ_ROLE, AUDIT_SUBJ_ROLE); break; case Opt_subj_type: ima_log_string(ab, "subj_type", args[0].from); result = ima_lsm_rule_init(entry, args, LSM_SUBJ_TYPE, AUDIT_SUBJ_TYPE); break; case Opt_appraise_type: ima_log_string(ab, "appraise_type", args[0].from); if ((strcmp(args[0].from, "imasig")) == 0) entry->flags |= IMA_DIGSIG_REQUIRED; else if (IS_ENABLED(CONFIG_IMA_APPRAISE_MODSIG) && strcmp(args[0].from, "imasig|modsig") == 0) entry->flags |= IMA_DIGSIG_REQUIRED | IMA_MODSIG_ALLOWED; else result = -EINVAL; break; case Opt_appraise_flag: ima_log_string(ab, "appraise_flag", args[0].from); if (IS_ENABLED(CONFIG_IMA_APPRAISE_MODSIG) && strstr(args[0].from, "blacklist")) entry->flags |= IMA_CHECK_BLACKLIST; else result = -EINVAL; break; case Opt_appraise_algos: ima_log_string(ab, "appraise_algos", args[0].from); if (entry->allowed_algos) { result = -EINVAL; break; } entry->allowed_algos = ima_parse_appraise_algos(args[0].from); /* invalid or empty list of algorithms */ if (!entry->allowed_algos) { result = -EINVAL; break; } entry->flags |= IMA_VALIDATE_ALGOS; break; case Opt_permit_directio: entry->flags |= IMA_PERMIT_DIRECTIO; break; case Opt_pcr: ima_log_string(ab, "pcr", args[0].from); result = kstrtoint(args[0].from, 10, &entry->pcr); if (result || INVALID_PCR(entry->pcr)) result = -EINVAL; else entry->flags |= IMA_PCR; break; case Opt_template: ima_log_string(ab, "template", args[0].from); if (entry->action != MEASURE) { result = -EINVAL; break; } template_desc = lookup_template_desc(args[0].from); if (!template_desc || entry->template) { result = -EINVAL; break; } /* * template_desc_init_fields() does nothing if * the template is already initialised, so * it's safe to do this unconditionally */ template_desc_init_fields(template_desc->fmt, &(template_desc->fields), &(template_desc->num_fields)); entry->template = template_desc; break; case Opt_err: ima_log_string(ab, "UNKNOWN", p); result = -EINVAL; break; } } if (!result && !ima_validate_rule(entry)) result = -EINVAL; else if (entry->action == APPRAISE) temp_ima_appraise |= ima_appraise_flag(entry->func); if (!result && entry->flags & IMA_MODSIG_ALLOWED) { template_desc = entry->template ? 
entry->template : ima_template_desc_current(); check_template_modsig(template_desc); } audit_log_format(ab, "res=%d", !result); audit_log_end(ab); return result; } /** * ima_parse_add_rule - add a rule to ima_policy_rules * @rule: ima measurement policy rule * * Avoid locking by allowing just one writer at a time in ima_write_policy() * Returns the length of the rule parsed, an error code on failure */ ssize_t ima_parse_add_rule(char *rule) { static const char op[] = "update_policy"; char *p; struct ima_rule_entry *entry; ssize_t result, len; int audit_info = 0; p = strsep(&rule, "\n"); len = strlen(p) + 1; p += strspn(p, " \t"); if (*p == '#' || *p == '\0') return len; entry = kzalloc(sizeof(*entry), GFP_KERNEL); if (!entry) { integrity_audit_msg(AUDIT_INTEGRITY_STATUS, NULL, NULL, op, "-ENOMEM", -ENOMEM, audit_info); return -ENOMEM; } INIT_LIST_HEAD(&entry->list); result = ima_parse_rule(p, entry); if (result) { ima_free_rule(entry); integrity_audit_msg(AUDIT_INTEGRITY_STATUS, NULL, NULL, op, "invalid-policy", result, audit_info); return result; } list_add_tail(&entry->list, &ima_temp_rules); return len; } /** * ima_delete_rules() called to cleanup invalid in-flight policy. * We don't need locking as we operate on the temp list, which is * different from the active one. There is also only one user of * ima_delete_rules() at a time. */ void ima_delete_rules(void) { struct ima_rule_entry *entry, *tmp; temp_ima_appraise = 0; list_for_each_entry_safe(entry, tmp, &ima_temp_rules, list) { list_del(&entry->list); ima_free_rule(entry); } } #define __ima_hook_stringify(func, str) (#func), const char *const func_tokens[] = { __ima_hooks(__ima_hook_stringify) }; #ifdef CONFIG_IMA_READ_POLICY enum { mask_exec = 0, mask_write, mask_read, mask_append }; static const char *const mask_tokens[] = { "^MAY_EXEC", "^MAY_WRITE", "^MAY_READ", "^MAY_APPEND" }; void *ima_policy_start(struct seq_file *m, loff_t *pos) { loff_t l = *pos; struct ima_rule_entry *entry; struct list_head *ima_rules_tmp; rcu_read_lock(); ima_rules_tmp = rcu_dereference(ima_rules); list_for_each_entry_rcu(entry, ima_rules_tmp, list) { if (!l--) { rcu_read_unlock(); return entry; } } rcu_read_unlock(); return NULL; } void *ima_policy_next(struct seq_file *m, void *v, loff_t *pos) { struct ima_rule_entry *entry = v; rcu_read_lock(); entry = list_entry_rcu(entry->list.next, struct ima_rule_entry, list); rcu_read_unlock(); (*pos)++; return (&entry->list == &ima_default_rules || &entry->list == &ima_policy_rules) ? NULL : entry; } void ima_policy_stop(struct seq_file *m, void *v) { } #define pt(token) policy_tokens[token].pattern #define mt(token) mask_tokens[token] /* * policy_func_show - display the ima_hooks policy rule */ static void policy_func_show(struct seq_file *m, enum ima_hooks func) { if (func > 0 && func < MAX_CHECK) seq_printf(m, "func=%s ", func_tokens[func]); else seq_printf(m, "func=%d ", func); } static void ima_show_rule_opt_list(struct seq_file *m, const struct ima_rule_opt_list *opt_list) { size_t i; for (i = 0; i < opt_list->count; i++) seq_printf(m, "%s%s", i ? 
"|" : "", opt_list->items[i]); } static void ima_policy_show_appraise_algos(struct seq_file *m, unsigned int allowed_hashes) { int idx, list_size = 0; for (idx = 0; idx < HASH_ALGO__LAST; idx++) { if (!(allowed_hashes & (1U << idx))) continue; /* only add commas if the list contains multiple entries */ if (list_size++) seq_puts(m, ","); seq_puts(m, hash_algo_name[idx]); } } int ima_policy_show(struct seq_file *m, void *v) { struct ima_rule_entry *entry = v; int i; char tbuf[64] = {0,}; int offset = 0; rcu_read_lock(); /* Do not print rules with inactive LSM labels */ for (i = 0; i < MAX_LSM_RULES; i++) { if (entry->lsm[i].args_p && !entry->lsm[i].rule) { rcu_read_unlock(); return 0; } } if (entry->action & MEASURE) seq_puts(m, pt(Opt_measure)); if (entry->action & DONT_MEASURE) seq_puts(m, pt(Opt_dont_measure)); if (entry->action & APPRAISE) seq_puts(m, pt(Opt_appraise)); if (entry->action & DONT_APPRAISE) seq_puts(m, pt(Opt_dont_appraise)); if (entry->action & AUDIT) seq_puts(m, pt(Opt_audit)); if (entry->action & HASH) seq_puts(m, pt(Opt_hash)); if (entry->action & DONT_HASH) seq_puts(m, pt(Opt_dont_hash)); seq_puts(m, " "); if (entry->flags & IMA_FUNC) policy_func_show(m, entry->func); if ((entry->flags & IMA_MASK) || (entry->flags & IMA_INMASK)) { if (entry->flags & IMA_MASK) offset = 1; if (entry->mask & MAY_EXEC) seq_printf(m, pt(Opt_mask), mt(mask_exec) + offset); if (entry->mask & MAY_WRITE) seq_printf(m, pt(Opt_mask), mt(mask_write) + offset); if (entry->mask & MAY_READ) seq_printf(m, pt(Opt_mask), mt(mask_read) + offset); if (entry->mask & MAY_APPEND) seq_printf(m, pt(Opt_mask), mt(mask_append) + offset); seq_puts(m, " "); } if (entry->flags & IMA_FSMAGIC) { snprintf(tbuf, sizeof(tbuf), "0x%lx", entry->fsmagic); seq_printf(m, pt(Opt_fsmagic), tbuf); seq_puts(m, " "); } if (entry->flags & IMA_FSNAME) { snprintf(tbuf, sizeof(tbuf), "%s", entry->fsname); seq_printf(m, pt(Opt_fsname), tbuf); seq_puts(m, " "); } if (entry->flags & IMA_KEYRINGS) { seq_puts(m, "keyrings="); ima_show_rule_opt_list(m, entry->keyrings); seq_puts(m, " "); } if (entry->flags & IMA_LABEL) { seq_puts(m, "label="); ima_show_rule_opt_list(m, entry->label); seq_puts(m, " "); } if (entry->flags & IMA_PCR) { snprintf(tbuf, sizeof(tbuf), "%d", entry->pcr); seq_printf(m, pt(Opt_pcr), tbuf); seq_puts(m, " "); } if (entry->flags & IMA_FSUUID) { seq_printf(m, "fsuuid=%pU", &entry->fsuuid); seq_puts(m, " "); } if (entry->flags & IMA_UID) { snprintf(tbuf, sizeof(tbuf), "%d", __kuid_val(entry->uid)); if (entry->uid_op == &uid_gt) seq_printf(m, pt(Opt_uid_gt), tbuf); else if (entry->uid_op == &uid_lt) seq_printf(m, pt(Opt_uid_lt), tbuf); else seq_printf(m, pt(Opt_uid_eq), tbuf); seq_puts(m, " "); } if (entry->flags & IMA_EUID) { snprintf(tbuf, sizeof(tbuf), "%d", __kuid_val(entry->uid)); if (entry->uid_op == &uid_gt) seq_printf(m, pt(Opt_euid_gt), tbuf); else if (entry->uid_op == &uid_lt) seq_printf(m, pt(Opt_euid_lt), tbuf); else seq_printf(m, pt(Opt_euid_eq), tbuf); seq_puts(m, " "); } if (entry->flags & IMA_FOWNER) { snprintf(tbuf, sizeof(tbuf), "%d", __kuid_val(entry->fowner)); if (entry->fowner_op == &uid_gt) seq_printf(m, pt(Opt_fowner_gt), tbuf); else if (entry->fowner_op == &uid_lt) seq_printf(m, pt(Opt_fowner_lt), tbuf); else seq_printf(m, pt(Opt_fowner_eq), tbuf); seq_puts(m, " "); } if (entry->flags & IMA_VALIDATE_ALGOS) { seq_puts(m, "appraise_algos="); ima_policy_show_appraise_algos(m, entry->allowed_algos); seq_puts(m, " "); } for (i = 0; i < MAX_LSM_RULES; i++) { if (entry->lsm[i].rule) { switch (i) { case 
LSM_OBJ_USER: seq_printf(m, pt(Opt_obj_user), entry->lsm[i].args_p); break; case LSM_OBJ_ROLE: seq_printf(m, pt(Opt_obj_role), entry->lsm[i].args_p); break; case LSM_OBJ_TYPE: seq_printf(m, pt(Opt_obj_type), entry->lsm[i].args_p); break; case LSM_SUBJ_USER: seq_printf(m, pt(Opt_subj_user), entry->lsm[i].args_p); break; case LSM_SUBJ_ROLE: seq_printf(m, pt(Opt_subj_role), entry->lsm[i].args_p); break; case LSM_SUBJ_TYPE: seq_printf(m, pt(Opt_subj_type), entry->lsm[i].args_p); break; } seq_puts(m, " "); } } if (entry->template) seq_printf(m, "template=%s ", entry->template->name); if (entry->flags & IMA_DIGSIG_REQUIRED) { if (entry->flags & IMA_MODSIG_ALLOWED) seq_puts(m, "appraise_type=imasig|modsig "); else seq_puts(m, "appraise_type=imasig "); } if (entry->flags & IMA_CHECK_BLACKLIST) seq_puts(m, "appraise_flag=check_blacklist "); if (entry->flags & IMA_PERMIT_DIRECTIO) seq_puts(m, "permit_directio "); rcu_read_unlock(); seq_puts(m, "\n"); return 0; } #endif /* CONFIG_IMA_READ_POLICY */ #if defined(CONFIG_IMA_APPRAISE) && defined(CONFIG_INTEGRITY_TRUSTED_KEYRING) /* * ima_appraise_signature: whether IMA will appraise a given function using * an IMA digital signature. This is restricted to cases where the kernel * has a set of built-in trusted keys in order to avoid an attacker simply * loading additional keys. */ bool ima_appraise_signature(enum kernel_read_file_id id) { struct ima_rule_entry *entry; bool found = false; enum ima_hooks func; struct list_head *ima_rules_tmp; if (id >= READING_MAX_ID) return false; if (id == READING_KEXEC_IMAGE && !(ima_appraise & IMA_APPRAISE_ENFORCE) && security_locked_down(LOCKDOWN_KEXEC)) return false; func = read_idmap[id] ?: FILE_CHECK; rcu_read_lock(); ima_rules_tmp = rcu_dereference(ima_rules); list_for_each_entry_rcu(entry, ima_rules_tmp, list) { if (entry->action != APPRAISE) continue; /* * A generic entry will match, but otherwise require that it * match the func we're looking for */ if (entry->func && entry->func != func) continue; /* * We require this to be a digital signature, not a raw IMA * hash. */ if (entry->flags & IMA_DIGSIG_REQUIRED) found = true; /* * We've found a rule that matches, so break now even if it * didn't require a digital signature - a later rule that does * won't override it, so would be a false positive. */ break; } rcu_read_unlock(); return found; } #endif /* CONFIG_IMA_APPRAISE && CONFIG_INTEGRITY_TRUSTED_KEYRING */
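/*
 * Illustrative sketch (not part of the kernel source above or below): the
 * parser in this file consumes one rule per line, in the token grammar
 * listed in policy_tokens -- an action first ("measure", "appraise", ...)
 * followed by options such as func=, mask= or uid=. The minimal user-space
 * program below feeds one such rule to ima_parse_add_rule() through
 * securityfs. It assumes securityfs is mounted at /sys/kernel/security and
 * that the running kernel has IMA enabled; the rule string itself is an
 * arbitrary example of the grammar, not a recommended policy.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* example rule only: measure root-owned execs at BPRM_CHECK */
	const char rule[] = "measure func=BPRM_CHECK mask=MAY_EXEC uid=0\n";
	int fd = open("/sys/kernel/security/ima/policy", O_WRONLY);

	if (fd < 0) {
		perror("open ima/policy");
		return 1;
	}
	/* each line written here is handed to ima_parse_add_rule() */
	if (write(fd, rule, strlen(rule)) < 0)
		perror("write rule");
	/*
	 * The staged ima_temp_rules only extend or replace the active
	 * policy on .release, i.e. when close() succeeds.
	 */
	if (close(fd) < 0)
		perror("close (policy rejected?)");
	return 0;
}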
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 *	Linux ethernet bridge
 *
 *	Authors:
 *	Lennert Buytenhek		<buytenh@gnu.org>
 */
#ifndef _BR_PRIVATE_STP_H
#define _BR_PRIVATE_STP_H

#define BPDU_TYPE_CONFIG 0
#define BPDU_TYPE_TCN 0x80

/* IEEE 802.1D-1998 timer values */
#define BR_MIN_HELLO_TIME	(1*HZ)
#define BR_MAX_HELLO_TIME	(10*HZ)

#define BR_MIN_FORWARD_DELAY	(2*HZ)
#define BR_MAX_FORWARD_DELAY	(30*HZ)

#define BR_MIN_MAX_AGE		(6*HZ)
#define BR_MAX_MAX_AGE		(40*HZ)

#define BR_MIN_PATH_COST	1
#define BR_MAX_PATH_COST	65535

struct br_config_bpdu {
	unsigned int	topology_change:1;
	unsigned int	topology_change_ack:1;
	bridge_id	root;
	int		root_path_cost;
	bridge_id	bridge_id;
	port_id		port_id;
	int		message_age;
	int		max_age;
	int		hello_time;
	int		forward_delay;
};

/* called under bridge lock */
static inline int br_is_designated_port(const struct net_bridge_port *p)
{
	return !memcmp(&p->designated_bridge, &p->br->bridge_id, 8) &&
		(p->designated_port == p->port_id);
}

/* br_stp.c */
void br_become_root_bridge(struct net_bridge *br);
void br_config_bpdu_generation(struct net_bridge *);
void br_configuration_update(struct net_bridge *);
void br_port_state_selection(struct net_bridge *);
void br_received_config_bpdu(struct net_bridge_port *p,
			     const struct br_config_bpdu *bpdu);
void br_received_tcn_bpdu(struct net_bridge_port *p);
void br_transmit_config(struct net_bridge_port *p);
void br_transmit_tcn(struct net_bridge *br);
void br_topology_change_detection(struct net_bridge *br);
void __br_set_topology_change(struct net_bridge *br, unsigned char val);

/* br_stp_bpdu.c */
void br_send_config_bpdu(struct net_bridge_port *, struct br_config_bpdu *);
void br_send_tcn_bpdu(struct net_bridge_port *);

#endif
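/*
 * Illustrative sketch (not part of the header above): how the BR_MIN_xxx
 * and BR_MAX_xxx bounds are meant to be applied when a user-supplied STP
 * timer, expressed in jiffies, is validated. The helper name and the
 * -ERANGE convention are assumptions made for illustration; the kernel's
 * own timer setters may differ in detail.
 */
#include <linux/errno.h>

static inline int br_stp_check_hello_time(unsigned long t_jiffies)
{
	/* 802.1D-1998 allows a hello time of 1..10 seconds */
	if (t_jiffies < BR_MIN_HELLO_TIME || t_jiffies > BR_MAX_HELLO_TIME)
		return -ERANGE;
	return 0;
}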
// SPDX-License-Identifier: GPL-2.0
/* net/atm/pvc.c - ATM PVC sockets */

/* Written 1995-2000 by Werner Almesberger, EPFL LRC/ICA */

#include <linux/net.h>		/* struct socket, struct proto_ops */
#include <linux/atm.h>		/* ATM stuff */
#include <linux/atmdev.h>	/* ATM devices */
#include <linux/errno.h>	/* error codes */
#include <linux/kernel.h>	/* printk */
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/bitops.h>
#include <linux/export.h>
#include <net/sock.h>		/* for sock_no_* */

#include "resources.h"		/* devs and vccs */
#include "common.h"		/* common for PVCs and SVCs */

static int pvc_shutdown(struct socket *sock, int how)
{
	return 0;
}

static int pvc_bind(struct socket *sock, struct sockaddr *sockaddr,
		    int sockaddr_len)
{
	struct sock *sk = sock->sk;
	struct sockaddr_atmpvc *addr;
	struct atm_vcc *vcc;
	int error;

	if (sockaddr_len != sizeof(struct sockaddr_atmpvc))
		return -EINVAL;
	addr = (struct sockaddr_atmpvc *)sockaddr;
	if (addr->sap_family != AF_ATMPVC)
		return -EAFNOSUPPORT;
	lock_sock(sk);
	vcc = ATM_SD(sock);
	if (!test_bit(ATM_VF_HASQOS, &vcc->flags)) {
		error = -EBADFD;
		goto out;
	}
	if (test_bit(ATM_VF_PARTIAL, &vcc->flags)) {
		if (vcc->vpi != ATM_VPI_UNSPEC)
			addr->sap_addr.vpi = vcc->vpi;
		if (vcc->vci != ATM_VCI_UNSPEC)
			addr->sap_addr.vci = vcc->vci;
	}
	error = vcc_connect(sock, addr->sap_addr.itf, addr->sap_addr.vpi,
			    addr->sap_addr.vci);
out:
	release_sock(sk);
	return error;
}

static int pvc_connect(struct socket *sock, struct sockaddr *sockaddr,
		       int sockaddr_len, int flags)
{
	return pvc_bind(sock, sockaddr, sockaddr_len);
}

static int pvc_setsockopt(struct socket *sock, int level, int optname,
			  sockptr_t optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;
	int error;

	lock_sock(sk);
	error = vcc_setsockopt(sock, level, optname, optval, optlen);
	release_sock(sk);
	return error;
}

static int pvc_getsockopt(struct socket *sock, int level, int optname,
			  char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;
	int error;

	lock_sock(sk);
	error = vcc_getsockopt(sock, level, optname, optval, optlen);
	release_sock(sk);
	return error;
}

static int pvc_getname(struct socket *sock, struct sockaddr *sockaddr,
		       int peer)
{
	struct sockaddr_atmpvc *addr;
	struct atm_vcc *vcc = ATM_SD(sock);

	if (!vcc->dev || !test_bit(ATM_VF_ADDR, &vcc->flags))
		return -ENOTCONN;
	addr = (struct sockaddr_atmpvc *)sockaddr;
	memset(addr, 0, sizeof(*addr));
	addr->sap_family = AF_ATMPVC;
	addr->sap_addr.itf = vcc->dev->number;
	addr->sap_addr.vpi = vcc->vpi;
	addr->sap_addr.vci = vcc->vci;
	return sizeof(struct sockaddr_atmpvc);
}

static const struct proto_ops pvc_proto_ops = {
	.family =	PF_ATMPVC,
	.owner =	THIS_MODULE,

	.release =	vcc_release,
	.bind =		pvc_bind,
	.connect =	pvc_connect,
	.socketpair =	sock_no_socketpair,
	.accept =	sock_no_accept,
	.getname =	pvc_getname,
	.poll =		vcc_poll,
	.ioctl =	vcc_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl = vcc_compat_ioctl,
#endif
	.gettstamp =	sock_gettstamp,
	.listen =	sock_no_listen,
	.shutdown =	pvc_shutdown,
	.setsockopt =	pvc_setsockopt,
	.getsockopt =	pvc_getsockopt,
	.sendmsg =	vcc_sendmsg,
	.recvmsg =	vcc_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
};

static int pvc_create(struct net *net, struct socket *sock, int protocol,
		      int kern)
{
	if (net != &init_net)
		return -EAFNOSUPPORT;

	sock->ops = &pvc_proto_ops;
	return vcc_create(net, sock, protocol, PF_ATMPVC, kern);
}

static const struct net_proto_family pvc_family_ops = {
	.family = PF_ATMPVC,
	.create = pvc_create,
	.owner = THIS_MODULE,
};

/*
 *	Initialize the ATM PVC protocol family
 */
int __init atmpvc_init(void)
{
	return sock_register(&pvc_family_ops);
}

void atmpvc_exit(void)
{
	sock_unregister(PF_ATMPVC);
}
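/*
 * Illustrative sketch (not part of the kernel source above): the user-space
 * side of pvc_bind()/pvc_connect(). The interface number, VPI/VCI and QoS
 * values are arbitrary examples chosen for illustration, and error handling
 * is deliberately minimal.
 */
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <linux/atm.h>		/* struct sockaddr_atmpvc, struct atm_qos */

int main(void)
{
	struct sockaddr_atmpvc addr;
	struct atm_qos qos;
	int s = socket(PF_ATMPVC, SOCK_DGRAM, 0);

	if (s < 0) {
		perror("socket(PF_ATMPVC)");
		return 1;
	}

	/* pvc_bind() fails with -EBADFD unless QoS was set (ATM_VF_HASQOS) */
	memset(&qos, 0, sizeof(qos));
	qos.aal = ATM_AAL5;
	qos.rxtp.traffic_class = ATM_UBR;
	qos.rxtp.max_sdu = 1524;
	qos.txtp = qos.rxtp;
	if (setsockopt(s, SOL_ATM, SO_ATMQOS, &qos, sizeof(qos)) < 0)
		perror("setsockopt(SO_ATMQOS)");

	/* connect() on a PVC socket simply routes to pvc_bind() */
	memset(&addr, 0, sizeof(addr));
	addr.sap_family = AF_ATMPVC;
	addr.sap_addr.itf = 0;	/* example: first ATM interface */
	addr.sap_addr.vpi = 0;
	addr.sap_addr.vci = 42;	/* example VCI */
	if (connect(s, (struct sockaddr *)&addr, sizeof(addr)) < 0)
		perror("connect");

	return 0;
}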
// SPDX-License-Identifier: GPL-2.0-only
/*
 * VHT handling
 *
 * Portions of this file
 * Copyright(c) 2015 - 2016 Intel Deutschland GmbH
 * Copyright (C) 2018 - 2020 Intel Corporation
 */

#include <linux/ieee80211.h>
#include <linux/export.h>
#include <net/mac80211.h>
#include "ieee80211_i.h"
#include "rate.h"

static void __check_vhtcap_disable(struct ieee80211_sub_if_data *sdata,
				   struct ieee80211_sta_vht_cap *vht_cap,
				   u32 flag)
{
	__le32 le_flag = cpu_to_le32(flag);

	if (sdata->u.mgd.vht_capa_mask.vht_cap_info & le_flag &&
	    !(sdata->u.mgd.vht_capa.vht_cap_info & le_flag))
		vht_cap->cap &= ~flag;
}

void ieee80211_apply_vhtcap_overrides(struct ieee80211_sub_if_data *sdata,
				      struct ieee80211_sta_vht_cap *vht_cap)
{
	int i;
	u16 rxmcs_mask, rxmcs_cap, rxmcs_n, txmcs_mask, txmcs_cap, txmcs_n;

	if (!vht_cap->vht_supported)
		return;

	if (sdata->vif.type !=
NL80211_IFTYPE_STATION) return; __check_vhtcap_disable(sdata, vht_cap, IEEE80211_VHT_CAP_RXLDPC); __check_vhtcap_disable(sdata, vht_cap, IEEE80211_VHT_CAP_SHORT_GI_80); __check_vhtcap_disable(sdata, vht_cap, IEEE80211_VHT_CAP_SHORT_GI_160); __check_vhtcap_disable(sdata, vht_cap, IEEE80211_VHT_CAP_TXSTBC); __check_vhtcap_disable(sdata, vht_cap, IEEE80211_VHT_CAP_SU_BEAMFORMER_CAPABLE); __check_vhtcap_disable(sdata, vht_cap, IEEE80211_VHT_CAP_SU_BEAMFORMEE_CAPABLE); __check_vhtcap_disable(sdata, vht_cap, IEEE80211_VHT_CAP_RX_ANTENNA_PATTERN); __check_vhtcap_disable(sdata, vht_cap, IEEE80211_VHT_CAP_TX_ANTENNA_PATTERN); /* Allow user to decrease AMPDU length exponent */ if (sdata->u.mgd.vht_capa_mask.vht_cap_info & cpu_to_le32(IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_MASK)) { u32 cap, n; n = le32_to_cpu(sdata->u.mgd.vht_capa.vht_cap_info) & IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_MASK; n >>= IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_SHIFT; cap = vht_cap->cap & IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_MASK; cap >>= IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_SHIFT; if (n < cap) { vht_cap->cap &= ~IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_MASK; vht_cap->cap |= n << IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_SHIFT; } } /* Allow the user to decrease MCSes */ rxmcs_mask = le16_to_cpu(sdata->u.mgd.vht_capa_mask.supp_mcs.rx_mcs_map); rxmcs_n = le16_to_cpu(sdata->u.mgd.vht_capa.supp_mcs.rx_mcs_map); rxmcs_n &= rxmcs_mask; rxmcs_cap = le16_to_cpu(vht_cap->vht_mcs.rx_mcs_map); txmcs_mask = le16_to_cpu(sdata->u.mgd.vht_capa_mask.supp_mcs.tx_mcs_map); txmcs_n = le16_to_cpu(sdata->u.mgd.vht_capa.supp_mcs.tx_mcs_map); txmcs_n &= txmcs_mask; txmcs_cap = le16_to_cpu(vht_cap->vht_mcs.tx_mcs_map); for (i = 0; i < 8; i++) { u8 m, n, c; m = (rxmcs_mask >> 2*i) & IEEE80211_VHT_MCS_NOT_SUPPORTED; n = (rxmcs_n >> 2*i) & IEEE80211_VHT_MCS_NOT_SUPPORTED; c = (rxmcs_cap >> 2*i) & IEEE80211_VHT_MCS_NOT_SUPPORTED; if (m && ((c != IEEE80211_VHT_MCS_NOT_SUPPORTED && n < c) || n == IEEE80211_VHT_MCS_NOT_SUPPORTED)) { rxmcs_cap &= ~(3 << 2*i); rxmcs_cap |= (rxmcs_n & (3 << 2*i)); } m = (txmcs_mask >> 2*i) & IEEE80211_VHT_MCS_NOT_SUPPORTED; n = (txmcs_n >> 2*i) & IEEE80211_VHT_MCS_NOT_SUPPORTED; c = (txmcs_cap >> 2*i) & IEEE80211_VHT_MCS_NOT_SUPPORTED; if (m && ((c != IEEE80211_VHT_MCS_NOT_SUPPORTED && n < c) || n == IEEE80211_VHT_MCS_NOT_SUPPORTED)) { txmcs_cap &= ~(3 << 2*i); txmcs_cap |= (txmcs_n & (3 << 2*i)); } } vht_cap->vht_mcs.rx_mcs_map = cpu_to_le16(rxmcs_cap); vht_cap->vht_mcs.tx_mcs_map = cpu_to_le16(txmcs_cap); } void ieee80211_vht_cap_ie_to_sta_vht_cap(struct ieee80211_sub_if_data *sdata, struct ieee80211_supported_band *sband, const struct ieee80211_vht_cap *vht_cap_ie, struct sta_info *sta) { struct ieee80211_sta_vht_cap *vht_cap = &sta->sta.vht_cap; struct ieee80211_sta_vht_cap own_cap; u32 cap_info, i; bool have_80mhz; memset(vht_cap, 0, sizeof(*vht_cap)); if (!sta->sta.ht_cap.ht_supported) return; if (!vht_cap_ie || !sband->vht_cap.vht_supported) return; /* Allow VHT if at least one channel on the sband supports 80 MHz */ have_80mhz = false; for (i = 0; i < sband->n_channels; i++) { if (sband->channels[i].flags & (IEEE80211_CHAN_DISABLED | IEEE80211_CHAN_NO_80MHZ)) continue; have_80mhz = true; break; } if (!have_80mhz) return; /* * A VHT STA must support 40 MHz, but if we verify that here * then we break a few things - some APs (e.g. Netgear R6300v2 * and others based on the BCM4360 chipset) will unset this * capability bit when operating in 20 MHz. 
*/ vht_cap->vht_supported = true; own_cap = sband->vht_cap; /* * If user has specified capability overrides, take care * of that if the station we're setting up is the AP that * we advertised a restricted capability set to. Override * our own capabilities and then use those below. */ if (sdata->vif.type == NL80211_IFTYPE_STATION && !test_sta_flag(sta, WLAN_STA_TDLS_PEER)) ieee80211_apply_vhtcap_overrides(sdata, &own_cap); /* take some capabilities as-is */ cap_info = le32_to_cpu(vht_cap_ie->vht_cap_info); vht_cap->cap = cap_info; vht_cap->cap &= IEEE80211_VHT_CAP_RXLDPC | IEEE80211_VHT_CAP_VHT_TXOP_PS | IEEE80211_VHT_CAP_HTC_VHT | IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_MASK | IEEE80211_VHT_CAP_VHT_LINK_ADAPTATION_VHT_UNSOL_MFB | IEEE80211_VHT_CAP_VHT_LINK_ADAPTATION_VHT_MRQ_MFB | IEEE80211_VHT_CAP_RX_ANTENNA_PATTERN | IEEE80211_VHT_CAP_TX_ANTENNA_PATTERN; vht_cap->cap |= min_t(u32, cap_info & IEEE80211_VHT_CAP_MAX_MPDU_MASK, own_cap.cap & IEEE80211_VHT_CAP_MAX_MPDU_MASK); /* and some based on our own capabilities */ switch (own_cap.cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK) { case IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ: vht_cap->cap |= cap_info & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ; break; case IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ: vht_cap->cap |= cap_info & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK; break; default: /* nothing */ break; } /* symmetric capabilities */ vht_cap->cap |= cap_info & own_cap.cap & (IEEE80211_VHT_CAP_SHORT_GI_80 | IEEE80211_VHT_CAP_SHORT_GI_160); /* remaining ones */ if (own_cap.cap & IEEE80211_VHT_CAP_SU_BEAMFORMEE_CAPABLE) vht_cap->cap |= cap_info & (IEEE80211_VHT_CAP_SU_BEAMFORMER_CAPABLE | IEEE80211_VHT_CAP_SOUNDING_DIMENSIONS_MASK); if (own_cap.cap & IEEE80211_VHT_CAP_SU_BEAMFORMER_CAPABLE) vht_cap->cap |= cap_info & (IEEE80211_VHT_CAP_SU_BEAMFORMEE_CAPABLE | IEEE80211_VHT_CAP_BEAMFORMEE_STS_MASK); if (own_cap.cap & IEEE80211_VHT_CAP_MU_BEAMFORMER_CAPABLE) vht_cap->cap |= cap_info & IEEE80211_VHT_CAP_MU_BEAMFORMEE_CAPABLE; if (own_cap.cap & IEEE80211_VHT_CAP_MU_BEAMFORMEE_CAPABLE) vht_cap->cap |= cap_info & IEEE80211_VHT_CAP_MU_BEAMFORMER_CAPABLE; if (own_cap.cap & IEEE80211_VHT_CAP_TXSTBC) vht_cap->cap |= cap_info & IEEE80211_VHT_CAP_RXSTBC_MASK; if (own_cap.cap & IEEE80211_VHT_CAP_RXSTBC_MASK) vht_cap->cap |= cap_info & IEEE80211_VHT_CAP_TXSTBC; /* Copy peer MCS info, the driver might need them. 
*/ memcpy(&vht_cap->vht_mcs, &vht_cap_ie->supp_mcs, sizeof(struct ieee80211_vht_mcs_info)); /* copy EXT_NSS_BW Support value or remove the capability */ if (ieee80211_hw_check(&sdata->local->hw, SUPPORTS_VHT_EXT_NSS_BW)) vht_cap->cap |= (cap_info & IEEE80211_VHT_CAP_EXT_NSS_BW_MASK); else vht_cap->vht_mcs.tx_highest &= ~cpu_to_le16(IEEE80211_VHT_EXT_NSS_BW_CAPABLE); /* but also restrict MCSes */ for (i = 0; i < 8; i++) { u16 own_rx, own_tx, peer_rx, peer_tx; own_rx = le16_to_cpu(own_cap.vht_mcs.rx_mcs_map); own_rx = (own_rx >> i * 2) & IEEE80211_VHT_MCS_NOT_SUPPORTED; own_tx = le16_to_cpu(own_cap.vht_mcs.tx_mcs_map); own_tx = (own_tx >> i * 2) & IEEE80211_VHT_MCS_NOT_SUPPORTED; peer_rx = le16_to_cpu(vht_cap->vht_mcs.rx_mcs_map); peer_rx = (peer_rx >> i * 2) & IEEE80211_VHT_MCS_NOT_SUPPORTED; peer_tx = le16_to_cpu(vht_cap->vht_mcs.tx_mcs_map); peer_tx = (peer_tx >> i * 2) & IEEE80211_VHT_MCS_NOT_SUPPORTED; if (peer_tx != IEEE80211_VHT_MCS_NOT_SUPPORTED) { if (own_rx == IEEE80211_VHT_MCS_NOT_SUPPORTED) peer_tx = IEEE80211_VHT_MCS_NOT_SUPPORTED; else if (own_rx < peer_tx) peer_tx = own_rx; } if (peer_rx != IEEE80211_VHT_MCS_NOT_SUPPORTED) { if (own_tx == IEEE80211_VHT_MCS_NOT_SUPPORTED) peer_rx = IEEE80211_VHT_MCS_NOT_SUPPORTED; else if (own_tx < peer_rx) peer_rx = own_tx; } vht_cap->vht_mcs.rx_mcs_map &= ~cpu_to_le16(IEEE80211_VHT_MCS_NOT_SUPPORTED << i * 2); vht_cap->vht_mcs.rx_mcs_map |= cpu_to_le16(peer_rx << i * 2); vht_cap->vht_mcs.tx_mcs_map &= ~cpu_to_le16(IEEE80211_VHT_MCS_NOT_SUPPORTED << i * 2); vht_cap->vht_mcs.tx_mcs_map |= cpu_to_le16(peer_tx << i * 2); } /* * This is a workaround for VHT-enabled STAs which break the spec * and have the VHT-MCS Rx map filled in with value 3 for all eight * spacial streams, an example is AR9462. * * As per spec, in section 22.1.1 Introduction to the VHT PHY * A VHT STA shall support at least single spactial stream VHT-MCSs * 0 to 7 (transmit and receive) in all supported channel widths. */ if (vht_cap->vht_mcs.rx_mcs_map == cpu_to_le16(0xFFFF)) { vht_cap->vht_supported = false; sdata_info(sdata, "Ignoring VHT IE from %pM due to invalid rx_mcs_map\n", sta->addr); return; } /* finally set up the bandwidth */ switch (vht_cap->cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK) { case IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ: case IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ: sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_160; break; default: sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_80; if (!(vht_cap->vht_mcs.tx_highest & cpu_to_le16(IEEE80211_VHT_EXT_NSS_BW_CAPABLE))) break; /* * If this is non-zero, then it does support 160 MHz after all, * in one form or the other. We don't distinguish here (or even * above) between 160 and 80+80 yet. 
*/ if (cap_info & IEEE80211_VHT_CAP_EXT_NSS_BW_MASK) sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_160; } sta->sta.bandwidth = ieee80211_sta_cur_vht_bw(sta); switch (vht_cap->cap & IEEE80211_VHT_CAP_MAX_MPDU_MASK) { case IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_11454: sta->sta.max_amsdu_len = IEEE80211_MAX_MPDU_LEN_VHT_11454; break; case IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_7991: sta->sta.max_amsdu_len = IEEE80211_MAX_MPDU_LEN_VHT_7991; break; case IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_3895: default: sta->sta.max_amsdu_len = IEEE80211_MAX_MPDU_LEN_VHT_3895; break; } } /* FIXME: move this to some better location - parses HE now */ enum ieee80211_sta_rx_bandwidth ieee80211_sta_cap_rx_bw(struct sta_info *sta) { struct ieee80211_sta_vht_cap *vht_cap = &sta->sta.vht_cap; struct ieee80211_sta_he_cap *he_cap = &sta->sta.he_cap; u32 cap_width; if (he_cap->has_he) { u8 info = he_cap->he_cap_elem.phy_cap_info[0]; if (sta->sdata->vif.bss_conf.chandef.chan->band == NL80211_BAND_2GHZ) { if (info & IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_IN_2G) return IEEE80211_STA_RX_BW_40; else return IEEE80211_STA_RX_BW_20; } if (info & IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G || info & IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_80PLUS80_MHZ_IN_5G) return IEEE80211_STA_RX_BW_160; else if (info & IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_40MHZ_80MHZ_IN_5G) return IEEE80211_STA_RX_BW_80; return IEEE80211_STA_RX_BW_20; } if (!vht_cap->vht_supported) return sta->sta.ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40 ? IEEE80211_STA_RX_BW_40 : IEEE80211_STA_RX_BW_20; cap_width = vht_cap->cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK; if (cap_width == IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ || cap_width == IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ) return IEEE80211_STA_RX_BW_160; /* * If this is non-zero, then it does support 160 MHz after all, * in one form or the other. We don't distinguish here (or even * above) between 160 and 80+80 yet. */ if (vht_cap->cap & IEEE80211_VHT_CAP_EXT_NSS_BW_MASK) return IEEE80211_STA_RX_BW_160; return IEEE80211_STA_RX_BW_80; } enum nl80211_chan_width ieee80211_sta_cap_chan_bw(struct sta_info *sta) { struct ieee80211_sta_vht_cap *vht_cap = &sta->sta.vht_cap; u32 cap_width; if (!vht_cap->vht_supported) { if (!sta->sta.ht_cap.ht_supported) return NL80211_CHAN_WIDTH_20_NOHT; return sta->sta.ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40 ? 
NL80211_CHAN_WIDTH_40 : NL80211_CHAN_WIDTH_20; } cap_width = vht_cap->cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK; if (cap_width == IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ) return NL80211_CHAN_WIDTH_160; else if (cap_width == IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ) return NL80211_CHAN_WIDTH_80P80; return NL80211_CHAN_WIDTH_80; } enum nl80211_chan_width ieee80211_sta_rx_bw_to_chan_width(struct sta_info *sta) { enum ieee80211_sta_rx_bandwidth cur_bw = sta->sta.bandwidth; struct ieee80211_sta_vht_cap *vht_cap = &sta->sta.vht_cap; u32 cap_width; switch (cur_bw) { case IEEE80211_STA_RX_BW_20: if (!sta->sta.ht_cap.ht_supported) return NL80211_CHAN_WIDTH_20_NOHT; else return NL80211_CHAN_WIDTH_20; case IEEE80211_STA_RX_BW_40: return NL80211_CHAN_WIDTH_40; case IEEE80211_STA_RX_BW_80: return NL80211_CHAN_WIDTH_80; case IEEE80211_STA_RX_BW_160: cap_width = vht_cap->cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK; if (cap_width == IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ) return NL80211_CHAN_WIDTH_160; return NL80211_CHAN_WIDTH_80P80; default: return NL80211_CHAN_WIDTH_20; } } enum ieee80211_sta_rx_bandwidth ieee80211_chan_width_to_rx_bw(enum nl80211_chan_width width) { switch (width) { case NL80211_CHAN_WIDTH_20_NOHT: case NL80211_CHAN_WIDTH_20: return IEEE80211_STA_RX_BW_20; case NL80211_CHAN_WIDTH_40: return IEEE80211_STA_RX_BW_40; case NL80211_CHAN_WIDTH_80: return IEEE80211_STA_RX_BW_80; case NL80211_CHAN_WIDTH_160: case NL80211_CHAN_WIDTH_80P80: return IEEE80211_STA_RX_BW_160; default: WARN_ON_ONCE(1); return IEEE80211_STA_RX_BW_20; } } /* FIXME: rename/move - this deals with everything not just VHT */ enum ieee80211_sta_rx_bandwidth ieee80211_sta_cur_vht_bw(struct sta_info *sta) { struct ieee80211_sub_if_data *sdata = sta->sdata; enum ieee80211_sta_rx_bandwidth bw; enum nl80211_chan_width bss_width = sdata->vif.bss_conf.chandef.width; bw = ieee80211_sta_cap_rx_bw(sta); bw = min(bw, sta->cur_max_bandwidth); /* Don't consider AP's bandwidth for TDLS peers, section 11.23.1 of * IEEE80211-2016 specification makes higher bandwidth operation * possible on the TDLS link if the peers have wider bandwidth * capability. * * However, in this case, and only if the TDLS peer is authorized, * limit to the tdls_chandef so that the configuration here isn't * wider than what's actually requested on the channel context. 
*/ if (test_sta_flag(sta, WLAN_STA_TDLS_PEER) && test_sta_flag(sta, WLAN_STA_TDLS_WIDER_BW) && test_sta_flag(sta, WLAN_STA_AUTHORIZED) && sta->tdls_chandef.chan) bw = min(bw, ieee80211_chan_width_to_rx_bw(sta->tdls_chandef.width)); else bw = min(bw, ieee80211_chan_width_to_rx_bw(bss_width)); return bw; } void ieee80211_sta_set_rx_nss(struct sta_info *sta) { u8 ht_rx_nss = 0, vht_rx_nss = 0, he_rx_nss = 0, rx_nss; bool support_160; /* if we received a notification already don't overwrite it */ if (sta->sta.rx_nss) return; if (sta->sta.he_cap.has_he) { int i; u8 rx_mcs_80 = 0, rx_mcs_160 = 0; const struct ieee80211_sta_he_cap *he_cap = &sta->sta.he_cap; u16 mcs_160_map = le16_to_cpu(he_cap->he_mcs_nss_supp.rx_mcs_160); u16 mcs_80_map = le16_to_cpu(he_cap->he_mcs_nss_supp.rx_mcs_80); for (i = 7; i >= 0; i--) { u8 mcs_160 = (mcs_160_map >> (2 * i)) & 3; if (mcs_160 != IEEE80211_VHT_MCS_NOT_SUPPORTED) { rx_mcs_160 = i + 1; break; } } for (i = 7; i >= 0; i--) { u8 mcs_80 = (mcs_80_map >> (2 * i)) & 3; if (mcs_80 != IEEE80211_VHT_MCS_NOT_SUPPORTED) { rx_mcs_80 = i + 1; break; } } support_160 = he_cap->he_cap_elem.phy_cap_info[0] & IEEE80211_HE_PHY_CAP0_CHANNEL_WIDTH_SET_160MHZ_IN_5G; if (support_160) he_rx_nss = min(rx_mcs_80, rx_mcs_160); else he_rx_nss = rx_mcs_80; } if (sta->sta.ht_cap.ht_supported) { if (sta->sta.ht_cap.mcs.rx_mask[0]) ht_rx_nss++; if (sta->sta.ht_cap.mcs.rx_mask[1]) ht_rx_nss++; if (sta->sta.ht_cap.mcs.rx_mask[2]) ht_rx_nss++; if (sta->sta.ht_cap.mcs.rx_mask[3]) ht_rx_nss++; /* FIXME: consider rx_highest? */ } if (sta->sta.vht_cap.vht_supported) { int i; u16 rx_mcs_map; rx_mcs_map = le16_to_cpu(sta->sta.vht_cap.vht_mcs.rx_mcs_map); for (i = 7; i >= 0; i--) { u8 mcs = (rx_mcs_map >> (2 * i)) & 3; if (mcs != IEEE80211_VHT_MCS_NOT_SUPPORTED) { vht_rx_nss = i + 1; break; } } /* FIXME: consider rx_highest? 
*/ } rx_nss = max(vht_rx_nss, ht_rx_nss); rx_nss = max(he_rx_nss, rx_nss); sta->sta.rx_nss = max_t(u8, 1, rx_nss); } u32 __ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, u8 opmode, enum nl80211_band band) { enum ieee80211_sta_rx_bandwidth new_bw; struct sta_opmode_info sta_opmode = {}; u32 changed = 0; u8 nss; /* ignore - no support for BF yet */ if (opmode & IEEE80211_OPMODE_NOTIF_RX_NSS_TYPE_BF) return 0; nss = opmode & IEEE80211_OPMODE_NOTIF_RX_NSS_MASK; nss >>= IEEE80211_OPMODE_NOTIF_RX_NSS_SHIFT; nss += 1; if (sta->sta.rx_nss != nss) { sta->sta.rx_nss = nss; sta_opmode.rx_nss = nss; changed |= IEEE80211_RC_NSS_CHANGED; sta_opmode.changed |= STA_OPMODE_N_SS_CHANGED; } switch (opmode & IEEE80211_OPMODE_NOTIF_CHANWIDTH_MASK) { case IEEE80211_OPMODE_NOTIF_CHANWIDTH_20MHZ: /* ignore IEEE80211_OPMODE_NOTIF_BW_160_80P80 must not be set */ sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_20; break; case IEEE80211_OPMODE_NOTIF_CHANWIDTH_40MHZ: /* ignore IEEE80211_OPMODE_NOTIF_BW_160_80P80 must not be set */ sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_40; break; case IEEE80211_OPMODE_NOTIF_CHANWIDTH_80MHZ: if (opmode & IEEE80211_OPMODE_NOTIF_BW_160_80P80) sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_160; else sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_80; break; case IEEE80211_OPMODE_NOTIF_CHANWIDTH_160MHZ: /* legacy only, no longer used by newer spec */ sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_160; break; } new_bw = ieee80211_sta_cur_vht_bw(sta); if (new_bw != sta->sta.bandwidth) { sta->sta.bandwidth = new_bw; sta_opmode.bw = ieee80211_sta_rx_bw_to_chan_width(sta); changed |= IEEE80211_RC_BW_CHANGED; sta_opmode.changed |= STA_OPMODE_MAX_BW_CHANGED; } if (sta_opmode.changed) cfg80211_sta_opmode_change_notify(sdata->dev, sta->addr, &sta_opmode, GFP_KERNEL); return changed; } void ieee80211_process_mu_groups(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt) { struct ieee80211_bss_conf *bss_conf = &sdata->vif.bss_conf; if (!sdata->vif.mu_mimo_owner) return; if (!memcmp(mgmt->u.action.u.vht_group_notif.position, bss_conf->mu_group.position, WLAN_USER_POSITION_LEN) && !memcmp(mgmt->u.action.u.vht_group_notif.membership, bss_conf->mu_group.membership, WLAN_MEMBERSHIP_LEN)) return; memcpy(bss_conf->mu_group.membership, mgmt->u.action.u.vht_group_notif.membership, WLAN_MEMBERSHIP_LEN); memcpy(bss_conf->mu_group.position, mgmt->u.action.u.vht_group_notif.position, WLAN_USER_POSITION_LEN); ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_MU_GROUPS); } void ieee80211_update_mu_groups(struct ieee80211_vif *vif, const u8 *membership, const u8 *position) { struct ieee80211_bss_conf *bss_conf = &vif->bss_conf; if (WARN_ON_ONCE(!vif->mu_mimo_owner)) return; memcpy(bss_conf->mu_group.membership, membership, WLAN_MEMBERSHIP_LEN); memcpy(bss_conf->mu_group.position, position, WLAN_USER_POSITION_LEN); } EXPORT_SYMBOL_GPL(ieee80211_update_mu_groups); void ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, u8 opmode, enum nl80211_band band) { struct ieee80211_local *local = sdata->local; struct ieee80211_supported_band *sband = local->hw.wiphy->bands[band]; u32 changed = __ieee80211_vht_handle_opmode(sdata, sta, opmode, band); if (changed > 0) { ieee80211_recalc_min_chandef(sdata); rate_control_rate_update(local, sband, sta, changed); } } void ieee80211_get_vht_mask_from_cap(__le16 vht_cap, u16 vht_mask[NL80211_VHT_NSS_MAX]) { int i; u16 mask, cap = le16_to_cpu(vht_cap); for (i = 0; i < NL80211_VHT_NSS_MAX; i++) { mask 
= (cap >> i * 2) & IEEE80211_VHT_MCS_NOT_SUPPORTED; switch (mask) { case IEEE80211_VHT_MCS_SUPPORT_0_7: vht_mask[i] = 0x00FF; break; case IEEE80211_VHT_MCS_SUPPORT_0_8: vht_mask[i] = 0x01FF; break; case IEEE80211_VHT_MCS_SUPPORT_0_9: vht_mask[i] = 0x03FF; break; case IEEE80211_VHT_MCS_NOT_SUPPORTED: default: vht_mask[i] = 0; break; } } }
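/*
 * A minimal standalone sketch (not kernel code) of the 2-bit-per-stream
 * decoding that ieee80211_get_vht_mask_from_cap() performs above: each pair
 * of bits in the 16-bit VHT MCS map selects "MCS 0-7", "MCS 0-8", "MCS 0-9"
 * or "not supported" for one spatial stream. The constants mirror the
 * kernel's enum ieee80211_vht_mcs_support values; the sample map 0xfff6
 * (MCS 0-9 on stream 1, MCS 0-8 on stream 2, nothing on streams 3-8) is
 * made up for illustration.
 */
#include <stdio.h>
#include <stdint.h>

#define VHT_MCS_SUPPORT_0_7   0
#define VHT_MCS_SUPPORT_0_8   1
#define VHT_MCS_SUPPORT_0_9   2
#define VHT_MCS_NOT_SUPPORTED 3
#define VHT_NSS_MAX           8

static void vht_mask_from_cap(uint16_t cap, uint16_t mask[VHT_NSS_MAX])
{
	int i;

	for (i = 0; i < VHT_NSS_MAX; i++) {
		/* same two-bit extraction as the kernel loop above */
		switch ((cap >> (i * 2)) & VHT_MCS_NOT_SUPPORTED) {
		case VHT_MCS_SUPPORT_0_7:
			mask[i] = 0x00FF;
			break;
		case VHT_MCS_SUPPORT_0_8:
			mask[i] = 0x01FF;
			break;
		case VHT_MCS_SUPPORT_0_9:
			mask[i] = 0x03FF;
			break;
		default:
			mask[i] = 0;	/* stream not supported */
			break;
		}
	}
}

int main(void)
{
	uint16_t mask[VHT_NSS_MAX];
	int i;

	vht_mask_from_cap(0xfff6, mask);
	for (i = 0; i < VHT_NSS_MAX; i++)
		printf("NSS %d: 0x%04x\n", i + 1, mask[i]);
	return 0;
}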
// SPDX-License-Identifier: GPL-2.0 /* * bio-integrity.c - bio data integrity extensions * * Copyright (C) 2007, 2008, 2009 Oracle Corporation * Written by: Martin K. Petersen <martin.petersen@oracle.com> */ #include <linux/blkdev.h> #include <linux/mempool.h> #include <linux/export.h> #include <linux/bio.h> #include <linux/workqueue.h> #include <linux/slab.h> #include "blk.h" static struct kmem_cache *bip_slab; static struct workqueue_struct *kintegrityd_wq; void blk_flush_integrity(void) { flush_workqueue(kintegrityd_wq); } static void __bio_integrity_free(struct bio_set *bs, struct bio_integrity_payload *bip) { if (bs && mempool_initialized(&bs->bio_integrity_pool)) { if (bip->bip_vec) bvec_free(&bs->bvec_integrity_pool, bip->bip_vec, bip->bip_max_vcnt); mempool_free(bip, &bs->bio_integrity_pool); } else { kfree(bip); } } /** * bio_integrity_alloc - Allocate integrity payload and attach it to bio * @bio: bio to attach integrity metadata to * @gfp_mask: Memory allocation mask * @nr_vecs: Number of integrity metadata scatter-gather elements * * Description: This function prepares a bio for attaching integrity * metadata. nr_vecs specifies the maximum number of pages containing * integrity metadata that can be attached.
*/ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, gfp_t gfp_mask, unsigned int nr_vecs) { struct bio_integrity_payload *bip; struct bio_set *bs = bio->bi_pool; unsigned inline_vecs; if (WARN_ON_ONCE(bio_has_crypt_ctx(bio))) return ERR_PTR(-EOPNOTSUPP); if (!bs || !mempool_initialized(&bs->bio_integrity_pool)) { bip = kmalloc(struct_size(bip, bip_inline_vecs, nr_vecs), gfp_mask); inline_vecs = nr_vecs; } else { bip = mempool_alloc(&bs->bio_integrity_pool, gfp_mask); inline_vecs = BIO_INLINE_VECS; } if (unlikely(!bip)) return ERR_PTR(-ENOMEM); memset(bip, 0, sizeof(*bip)); if (nr_vecs > inline_vecs) { bip->bip_max_vcnt = nr_vecs; bip->bip_vec = bvec_alloc(&bs->bvec_integrity_pool, &bip->bip_max_vcnt, gfp_mask); if (!bip->bip_vec) goto err; } else { bip->bip_vec = bip->bip_inline_vecs; bip->bip_max_vcnt = inline_vecs; } bip->bip_bio = bio; bio->bi_integrity = bip; bio->bi_opf |= REQ_INTEGRITY; return bip; err: __bio_integrity_free(bs, bip); return ERR_PTR(-ENOMEM); } EXPORT_SYMBOL(bio_integrity_alloc); /** * bio_integrity_free - Free bio integrity payload * @bio: bio containing bip to be freed * * Description: Used to free the integrity portion of a bio. Usually * called from bio_free(). */ void bio_integrity_free(struct bio *bio) { struct bio_integrity_payload *bip = bio_integrity(bio); struct bio_set *bs = bio->bi_pool; if (bip->bip_flags & BIP_BLOCK_INTEGRITY) kfree(bvec_virt(bip->bip_vec)); __bio_integrity_free(bs, bip); bio->bi_integrity = NULL; bio->bi_opf &= ~REQ_INTEGRITY; } /** * bio_integrity_add_page - Attach integrity metadata * @bio: bio to update * @page: page containing integrity metadata * @len: number of bytes of integrity metadata in page * @offset: start offset within page * * Description: Attach a page containing integrity metadata to bio. */ int bio_integrity_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int offset) { struct bio_integrity_payload *bip = bio_integrity(bio); struct bio_vec *iv; if (bip->bip_vcnt >= bip->bip_max_vcnt) { printk(KERN_ERR "%s: bip_vec full\n", __func__); return 0; } iv = bip->bip_vec + bip->bip_vcnt; if (bip->bip_vcnt && bvec_gap_to_prev(bio->bi_bdev->bd_disk->queue, &bip->bip_vec[bip->bip_vcnt - 1], offset)) return 0; iv->bv_page = page; iv->bv_len = len; iv->bv_offset = offset; bip->bip_vcnt++; return len; } EXPORT_SYMBOL(bio_integrity_add_page); /** * bio_integrity_process - Process integrity metadata for a bio * @bio: bio to generate/verify integrity metadata for * @proc_iter: iterator to process * @proc_fn: Pointer to the relevant processing function */ static blk_status_t bio_integrity_process(struct bio *bio, struct bvec_iter *proc_iter, integrity_processing_fn *proc_fn) { struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); struct blk_integrity_iter iter; struct bvec_iter bviter; struct bio_vec bv; struct bio_integrity_payload *bip = bio_integrity(bio); blk_status_t ret = BLK_STS_OK; iter.disk_name = bio->bi_bdev->bd_disk->disk_name; iter.interval = 1 << bi->interval_exp; iter.seed = proc_iter->bi_sector; iter.prot_buf = bvec_virt(bip->bip_vec); __bio_for_each_segment(bv, bio, bviter, *proc_iter) { void *kaddr = bvec_kmap_local(&bv); iter.data_buf = kaddr; iter.data_size = bv.bv_len; ret = proc_fn(&iter); kunmap_local(kaddr); if (ret) break; } return ret; } /** * bio_integrity_prep - Prepare bio for integrity I/O * @bio: bio to prepare * * Description: Checks if the bio already has an integrity payload attached. 
* If it does, the payload has been generated by another kernel subsystem, * and we just pass it through. Otherwise it allocates the integrity payload. * The bio must have data direction, target device and start sector set prior * to calling. In the WRITE case, integrity metadata will be generated using * the block device's integrity function. In the READ case, the buffer * will be prepared for DMA and a suitable end_io handler set up. */ bool bio_integrity_prep(struct bio *bio) { struct bio_integrity_payload *bip; struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); void *buf; unsigned long start, end; unsigned int len, nr_pages; unsigned int bytes, offset, i; gfp_t gfp = GFP_NOIO; if (!bi) return true; if (bio_op(bio) != REQ_OP_READ && bio_op(bio) != REQ_OP_WRITE) return true; if (!bio_sectors(bio)) return true; /* Already protected? */ if (bio_integrity(bio)) return true; if (bio_data_dir(bio) == READ) { if (!bi->profile->verify_fn || !(bi->flags & BLK_INTEGRITY_VERIFY)) return true; } else { if (!bi->profile->generate_fn || !(bi->flags & BLK_INTEGRITY_GENERATE)) return true; /* * Zero the memory allocated to not leak uninitialized kernel * memory to disk. For PI this only affects the app tag, but * for non-integrity metadata it affects the entire metadata * buffer. */ gfp |= __GFP_ZERO; } /* Allocate kernel buffer for protection data */ len = bio_integrity_bytes(bi, bio_sectors(bio)); buf = kmalloc(len, gfp); if (unlikely(buf == NULL)) { printk(KERN_ERR "could not allocate integrity buffer\n"); goto err_end_io; } end = (((unsigned long) buf) + len + PAGE_SIZE - 1) >> PAGE_SHIFT; start = ((unsigned long) buf) >> PAGE_SHIFT; nr_pages = end - start; /* Allocate bio integrity payload and integrity vectors */ bip = bio_integrity_alloc(bio, GFP_NOIO, nr_pages); if (IS_ERR(bip)) { printk(KERN_ERR "could not allocate data integrity bioset\n"); kfree(buf); goto err_end_io; } bip->bip_flags |= BIP_BLOCK_INTEGRITY; bip->bip_iter.bi_size = len; bip_set_seed(bip, bio->bi_iter.bi_sector); if (bi->flags & BLK_INTEGRITY_IP_CHECKSUM) bip->bip_flags |= BIP_IP_CHECKSUM; /* Map it */ offset = offset_in_page(buf); for (i = 0 ; i < nr_pages ; i++) { int ret; bytes = PAGE_SIZE - offset; if (len <= 0) break; if (bytes > len) bytes = len; ret = bio_integrity_add_page(bio, virt_to_page(buf), bytes, offset); if (ret == 0) { printk(KERN_ERR "could not attach integrity payload\n"); goto err_end_io; } if (ret < bytes) break; buf += bytes; len -= bytes; offset = 0; } /* Auto-generate integrity metadata if this is a write */ if (bio_data_dir(bio) == WRITE) { bio_integrity_process(bio, &bio->bi_iter, bi->profile->generate_fn); } else { bip->bio_iter = bio->bi_iter; } return true; err_end_io: bio->bi_status = BLK_STS_RESOURCE; bio_endio(bio); return false; } EXPORT_SYMBOL(bio_integrity_prep); /** * bio_integrity_verify_fn - Integrity I/O completion worker * @work: Work struct stored in bio to be verified * * Description: This workqueue function is called to complete a READ * request. The function verifies the transferred integrity metadata * and then calls the original bio end_io function. */ static void bio_integrity_verify_fn(struct work_struct *work) { struct bio_integrity_payload *bip = container_of(work, struct bio_integrity_payload, bip_work); struct bio *bio = bip->bip_bio; struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); /* * At the moment verify is called, the bio's iterator has been advanced * during split and completion, so we need to rewind the iterator to * its original position.
*/ bio->bi_status = bio_integrity_process(bio, &bip->bio_iter, bi->profile->verify_fn); bio_integrity_free(bio); bio_endio(bio); } /** * __bio_integrity_endio - Integrity I/O completion function * @bio: Protected bio * * Description: Completion for integrity I/O * * Normally I/O completion is done in interrupt context. However, * verifying I/O integrity is a time-consuming task which must be run * in process context. This function postpones completion * accordingly. */ bool __bio_integrity_endio(struct bio *bio) { struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); struct bio_integrity_payload *bip = bio_integrity(bio); if (bio_op(bio) == REQ_OP_READ && !bio->bi_status && (bip->bip_flags & BIP_BLOCK_INTEGRITY) && bi->profile->verify_fn) { INIT_WORK(&bip->bip_work, bio_integrity_verify_fn); queue_work(kintegrityd_wq, &bip->bip_work); return false; } bio_integrity_free(bio); return true; } /** * bio_integrity_advance - Advance integrity vector * @bio: bio whose integrity vector to update * @bytes_done: number of data bytes that have been completed * * Description: This function calculates how many integrity bytes the * number of completed data bytes correspond to and advances the * integrity vector accordingly. */ void bio_integrity_advance(struct bio *bio, unsigned int bytes_done) { struct bio_integrity_payload *bip = bio_integrity(bio); struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); unsigned bytes = bio_integrity_bytes(bi, bytes_done >> 9); bip->bip_iter.bi_sector += bio_integrity_intervals(bi, bytes_done >> 9); bvec_iter_advance(bip->bip_vec, &bip->bip_iter, bytes); } /** * bio_integrity_trim - Trim integrity vector * @bio: bio whose integrity vector to update * * Description: Used to trim the integrity vector in a cloned bio. */ void bio_integrity_trim(struct bio *bio) { struct bio_integrity_payload *bip = bio_integrity(bio); struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); bip->bip_iter.bi_size = bio_integrity_bytes(bi, bio_sectors(bio)); } EXPORT_SYMBOL(bio_integrity_trim); /** * bio_integrity_clone - Callback for cloning bios with integrity metadata * @bio: New bio * @bio_src: Original bio * @gfp_mask: Memory allocation mask * * Description: Called to allocate a bip when cloning a bio */ int bio_integrity_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp_mask) { struct bio_integrity_payload *bip_src = bio_integrity(bio_src); struct bio_integrity_payload *bip; BUG_ON(bip_src == NULL); bip = bio_integrity_alloc(bio, gfp_mask, bip_src->bip_vcnt); if (IS_ERR(bip)) return PTR_ERR(bip); memcpy(bip->bip_vec, bip_src->bip_vec, bip_src->bip_vcnt * sizeof(struct bio_vec)); bip->bip_vcnt = bip_src->bip_vcnt; bip->bip_iter = bip_src->bip_iter; bip->bip_flags = bip_src->bip_flags & ~BIP_BLOCK_INTEGRITY; return 0; } EXPORT_SYMBOL(bio_integrity_clone); int bioset_integrity_create(struct bio_set *bs, int pool_size) { if (mempool_initialized(&bs->bio_integrity_pool)) return 0; if (mempool_init_slab_pool(&bs->bio_integrity_pool, pool_size, bip_slab)) return -1; if (biovec_init_pool(&bs->bvec_integrity_pool, pool_size)) { mempool_exit(&bs->bio_integrity_pool); return -1; } return 0; } EXPORT_SYMBOL(bioset_integrity_create); void bioset_integrity_free(struct bio_set *bs) { mempool_exit(&bs->bio_integrity_pool); mempool_exit(&bs->bvec_integrity_pool); } void __init bio_integrity_init(void) { /* * kintegrityd won't block much but may burn a lot of CPU cycles. * Make it highpri CPU intensive wq with max concurrency of 1. 
*/ kintegrityd_wq = alloc_workqueue("kintegrityd", WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_CPU_INTENSIVE, 1); if (!kintegrityd_wq) panic("Failed to create kintegrityd\n"); bip_slab = kmem_cache_create("bio_integrity_payload", sizeof(struct bio_integrity_payload) + sizeof(struct bio_vec) * BIO_INLINE_VECS, 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); }
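/*
 * A standalone sketch (not kernel code) of the protection-buffer sizing that
 * bio_integrity_prep() does before kmalloc(): one metadata tuple per
 * protection interval, where the interval is 1 << interval_exp bytes. The
 * arithmetic is meant to mirror the usual bio_integrity_intervals() /
 * bio_integrity_bytes() helpers; the values below (512-byte intervals,
 * 8-byte T10 PI tuples, a 256 KiB bio) are example assumptions, not taken
 * from any particular device profile.
 */
#include <stdio.h>

int main(void)
{
	unsigned int interval_exp = 9;    /* 512-byte protection interval */
	unsigned int tuple_size = 8;      /* e.g. one T10 PI tuple per interval */
	unsigned int data_sectors = 512;  /* 256 KiB of data in 512-byte sectors */

	/* number of protection intervals: sectors >> (interval_exp - 9) */
	unsigned int intervals = data_sectors >> (interval_exp - 9);
	/* bytes of integrity metadata: one tuple per interval */
	unsigned int prot_bytes = intervals * tuple_size;

	printf("%u intervals -> %u bytes of integrity metadata\n",
	       intervals, prot_bytes);
	return 0;
}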
// SPDX-License-Identifier: GPL-2.0 /* * Fast batching percpu counters. */ #include <linux/percpu_counter.h> #include <linux/mutex.h> #include <linux/init.h> #include <linux/cpu.h> #include <linux/module.h> #include <linux/debugobjects.h> #ifdef CONFIG_HOTPLUG_CPU static LIST_HEAD(percpu_counters); static DEFINE_SPINLOCK(percpu_counters_lock); #endif #ifdef CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER static const struct debug_obj_descr percpu_counter_debug_descr; static bool percpu_counter_fixup_free(void *addr, enum debug_obj_state state) { struct percpu_counter *fbc = addr; switch (state) { case ODEBUG_STATE_ACTIVE: percpu_counter_destroy(fbc); debug_object_free(fbc, &percpu_counter_debug_descr); return true; default: return false; } } static const struct debug_obj_descr percpu_counter_debug_descr = { .name = "percpu_counter", .fixup_free = percpu_counter_fixup_free, }; static inline void debug_percpu_counter_activate(struct percpu_counter *fbc) { debug_object_init(fbc, &percpu_counter_debug_descr); debug_object_activate(fbc, &percpu_counter_debug_descr); } static inline void debug_percpu_counter_deactivate(struct percpu_counter *fbc) { debug_object_deactivate(fbc, &percpu_counter_debug_descr); debug_object_free(fbc, &percpu_counter_debug_descr); } #else /* CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER */ static inline void debug_percpu_counter_activate(struct percpu_counter *fbc) { } static inline void debug_percpu_counter_deactivate(struct percpu_counter *fbc) { } #endif /* CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER */ void percpu_counter_set(struct percpu_counter *fbc, s64 amount) { int cpu; unsigned long flags; raw_spin_lock_irqsave(&fbc->lock, flags); for_each_possible_cpu(cpu) { s32 *pcount = per_cpu_ptr(fbc->counters, cpu); *pcount = 0; } fbc->count = amount; raw_spin_unlock_irqrestore(&fbc->lock, flags); } EXPORT_SYMBOL(percpu_counter_set); /* * This function is both preempt and irq safe. The former is due to explicit * preemption disable. The latter is guaranteed by the fact that the slow path * is explicitly protected by an irq-safe spinlock whereas the fast path uses * this_cpu_add which is irq-safe by definition.
Hence there is no need to muck * with irq state before calling this one. */ void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount, s32 batch) { s64 count; preempt_disable(); count = __this_cpu_read(*fbc->counters) + amount; if (abs(count) >= batch) { unsigned long flags; raw_spin_lock_irqsave(&fbc->lock, flags); fbc->count += count; __this_cpu_sub(*fbc->counters, count - amount); raw_spin_unlock_irqrestore(&fbc->lock, flags); } else { this_cpu_add(*fbc->counters, amount); } preempt_enable(); } EXPORT_SYMBOL(percpu_counter_add_batch); /* * For a percpu_counter with a big batch, the deviation of its count could * be big, and there is a requirement to reduce the deviation, like when the * counter's batch could be runtime decreased to get a better accuracy, * which can be achieved by running this sync function on each CPU. */ void percpu_counter_sync(struct percpu_counter *fbc) { unsigned long flags; s64 count; raw_spin_lock_irqsave(&fbc->lock, flags); count = __this_cpu_read(*fbc->counters); fbc->count += count; __this_cpu_sub(*fbc->counters, count); raw_spin_unlock_irqrestore(&fbc->lock, flags); } EXPORT_SYMBOL(percpu_counter_sync); /* * Add up all the per-cpu counts, return the result. This is a more accurate * but much slower version of percpu_counter_read_positive() */ s64 __percpu_counter_sum(struct percpu_counter *fbc) { s64 ret; int cpu; unsigned long flags; raw_spin_lock_irqsave(&fbc->lock, flags); ret = fbc->count; for_each_online_cpu(cpu) { s32 *pcount = per_cpu_ptr(fbc->counters, cpu); ret += *pcount; } raw_spin_unlock_irqrestore(&fbc->lock, flags); return ret; } EXPORT_SYMBOL(__percpu_counter_sum); int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, gfp_t gfp, struct lock_class_key *key) { unsigned long flags __maybe_unused; raw_spin_lock_init(&fbc->lock); lockdep_set_class(&fbc->lock, key); fbc->count = amount; fbc->counters = alloc_percpu_gfp(s32, gfp); if (!fbc->counters) return -ENOMEM; debug_percpu_counter_activate(fbc); #ifdef CONFIG_HOTPLUG_CPU INIT_LIST_HEAD(&fbc->list); spin_lock_irqsave(&percpu_counters_lock, flags); list_add(&fbc->list, &percpu_counters); spin_unlock_irqrestore(&percpu_counters_lock, flags); #endif return 0; } EXPORT_SYMBOL(__percpu_counter_init); void percpu_counter_destroy(struct percpu_counter *fbc) { unsigned long flags __maybe_unused; if (!fbc->counters) return; debug_percpu_counter_deactivate(fbc); #ifdef CONFIG_HOTPLUG_CPU spin_lock_irqsave(&percpu_counters_lock, flags); list_del(&fbc->list); spin_unlock_irqrestore(&percpu_counters_lock, flags); #endif free_percpu(fbc->counters); fbc->counters = NULL; } EXPORT_SYMBOL(percpu_counter_destroy); int percpu_counter_batch __read_mostly = 32; EXPORT_SYMBOL(percpu_counter_batch); static int compute_batch_value(unsigned int cpu) { int nr = num_online_cpus(); percpu_counter_batch = max(32, nr*2); return 0; } static int percpu_counter_cpu_dead(unsigned int cpu) { #ifdef CONFIG_HOTPLUG_CPU struct percpu_counter *fbc; compute_batch_value(cpu); spin_lock_irq(&percpu_counters_lock); list_for_each_entry(fbc, &percpu_counters, list) { s32 *pcount; raw_spin_lock(&fbc->lock); pcount = per_cpu_ptr(fbc->counters, cpu); fbc->count += *pcount; *pcount = 0; raw_spin_unlock(&fbc->lock); } spin_unlock_irq(&percpu_counters_lock); #endif return 0; } /* * Compare counter against given value.
* Return 1 if greater, 0 if equal and -1 if less */ int __percpu_counter_compare(struct percpu_counter *fbc, s64 rhs, s32 batch) { s64 count; count = percpu_counter_read(fbc); /* Check to see if rough count will be sufficient for comparison */ if (abs(count - rhs) > (batch * num_online_cpus())) { if (count > rhs) return 1; else return -1; } /* Need to use precise count */ count = percpu_counter_sum(fbc); if (count > rhs) return 1; else if (count < rhs) return -1; else return 0; } EXPORT_SYMBOL(__percpu_counter_compare); static int __init percpu_counter_startup(void) { int ret; ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "lib/percpu_cnt:online", compute_batch_value, NULL); WARN_ON(ret < 0); ret = cpuhp_setup_state_nocalls(CPUHP_PERCPU_CNT_DEAD, "lib/percpu_cnt:dead", NULL, percpu_counter_cpu_dead); WARN_ON(ret < 0); return 0; } module_init(percpu_counter_startup);
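/*
 * A userspace sketch (not kernel code) of the batching rule implemented by
 * percpu_counter_add_batch() above: each CPU accumulates into a local
 * counter and only folds it into the shared count once the local value
 * reaches the batch, so an approximate read can be off by at most
 * batch * num_online_cpus(), which is the bound __percpu_counter_compare()
 * relies on. The CPU count, batch size and workload below are arbitrary
 * example values.
 */
#include <stdio.h>
#include <stdlib.h>

#define NR_CPUS 4
#define BATCH   32

static long long global_count;   /* analogue of fbc->count */
static int local_count[NR_CPUS]; /* analogue of the per-CPU s32 counters */

static void counter_add(int cpu, int amount)
{
	int count = local_count[cpu] + amount;

	if (abs(count) >= BATCH) {
		/* slow path: fold the local remainder plus amount into the shared count */
		global_count += count;
		local_count[cpu] = 0;
	} else {
		/* fast path: stays CPU-local, no shared-lock traffic */
		local_count[cpu] = count;
	}
}

int main(void)
{
	long long precise;
	int i, cpu;

	for (i = 0; i < 1000; i++)
		counter_add(i % NR_CPUS, 1);

	precise = global_count;
	for (cpu = 0; cpu < NR_CPUS; cpu++)
		precise += local_count[cpu];

	printf("approximate=%lld precise=%lld worst-case error=%d\n",
	       global_count, precise, NR_CPUS * BATCH);
	return 0;
}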
// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/super.c * * Copyright (C) 1991, 1992 Linus Torvalds * * super.c contains code to handle: - mount structures * - super-block tables * - filesystem drivers list * - mount system call * - umount system call * - ustat system call * * GK 2/5/95 - Changed to support mounting the root fs via NFS * * Added kerneld support: Jacques Gelinas and Bjorn Ekwall * Added change_root: Werner Almesberger & Hans Lermen, Feb '96 * Added options to /proc/mounts: * Torbjörn Lindh (torbjorn.lindh@gopta.se), April 14, 1996. * Added devfs support: Richard Gooch <rgooch@atnf.csiro.au>, 13-JAN-1998 * Heavily rewritten for 'one fs - one tree' dcache architecture. AV, Mar 2000 */ #include <linux/export.h> #include <linux/slab.h> #include <linux/blkdev.h> #include <linux/mount.h> #include <linux/security.h> #include <linux/writeback.h> /* for the emergency remount stuff */ #include <linux/idr.h> #include <linux/mutex.h> #include <linux/backing-dev.h> #include <linux/rculist_bl.h> #include <linux/cleancache.h> #include <linux/fscrypt.h> #include <linux/fsnotify.h> #include <linux/lockdep.h> #include <linux/user_namespace.h> #include <linux/fs_context.h> #include <uapi/linux/mount.h> #include "internal.h" static int thaw_super_locked(struct super_block *sb); static LIST_HEAD(super_blocks); static DEFINE_SPINLOCK(sb_lock); static char *sb_writers_name[SB_FREEZE_LEVELS] = { "sb_writers", "sb_pagefaults", "sb_internal", }; /* * One thing we have to be careful of with a per-sb shrinker is that we don't * drop the last active reference to the superblock from within the shrinker. * If that happens we could trigger unregistering the shrinker from within the * shrinker path and that leads to deadlock on the shrinker_rwsem. Hence we * take a passive reference to the superblock to avoid this from occurring. */ static unsigned long super_cache_scan(struct shrinker *shrink, struct shrink_control *sc) { struct super_block *sb; long fs_objects = 0; long total_objects; long freed = 0; long dentries; long inodes; sb = container_of(shrink, struct super_block, s_shrink); /* * Deadlock avoidance. We may hold various FS locks, and we don't want * to recurse into the FS that called us in clear_inode() and friends..
*/ if (!(sc->gfp_mask & __GFP_FS)) return SHRINK_STOP; if (!trylock_super(sb)) return SHRINK_STOP; if (sb->s_op->nr_cached_objects) fs_objects = sb->s_op->nr_cached_objects(sb, sc); inodes = list_lru_shrink_count(&sb->s_inode_lru, sc); dentries = list_lru_shrink_count(&sb->s_dentry_lru, sc); total_objects = dentries + inodes + fs_objects + 1; if (!total_objects) total_objects = 1; /* proportion the scan between the caches */ dentries = mult_frac(sc->nr_to_scan, dentries, total_objects); inodes = mult_frac(sc->nr_to_scan, inodes, total_objects); fs_objects = mult_frac(sc->nr_to_scan, fs_objects, total_objects); /* * prune the dcache first as the icache is pinned by it, then * prune the icache, followed by the filesystem specific caches * * Ensure that we always scan at least one object - memcg kmem * accounting uses this to fully empty the caches. */ sc->nr_to_scan = dentries + 1; freed = prune_dcache_sb(sb, sc); sc->nr_to_scan = inodes + 1; freed += prune_icache_sb(sb, sc); if (fs_objects) { sc->nr_to_scan = fs_objects + 1; freed += sb->s_op->free_cached_objects(sb, sc); } up_read(&sb->s_umount); return freed; } static unsigned long super_cache_count(struct shrinker *shrink, struct shrink_control *sc) { struct super_block *sb; long total_objects = 0; sb = container_of(shrink, struct super_block, s_shrink); /* * We don't call trylock_super() here as it is a scalability bottleneck, * so we're exposed to partial setup state. The shrinker rwsem does not * protect filesystem operations backing list_lru_shrink_count() or * s_op->nr_cached_objects(). Counts can change between * super_cache_count and super_cache_scan, so we really don't need locks * here. * * However, if we are currently mounting the superblock, the underlying * filesystem might be in a state of partial construction and hence it * is dangerous to access it. trylock_super() uses a SB_BORN check to * avoid this situation, so do the same here. The memory barrier is * matched with the one in mount_fs() as we don't hold locks here. */ if (!(sb->s_flags & SB_BORN)) return 0; smp_rmb(); if (sb->s_op && sb->s_op->nr_cached_objects) total_objects = sb->s_op->nr_cached_objects(sb, sc); total_objects += list_lru_shrink_count(&sb->s_dentry_lru, sc); total_objects += list_lru_shrink_count(&sb->s_inode_lru, sc); if (!total_objects) return SHRINK_EMPTY; total_objects = vfs_pressure_ratio(total_objects); return total_objects; } static void destroy_super_work(struct work_struct *work) { struct super_block *s = container_of(work, struct super_block, destroy_work); int i; for (i = 0; i < SB_FREEZE_LEVELS; i++) percpu_free_rwsem(&s->s_writers.rw_sem[i]); kfree(s); } static void destroy_super_rcu(struct rcu_head *head) { struct super_block *s = container_of(head, struct super_block, rcu); INIT_WORK(&s->destroy_work, destroy_super_work); schedule_work(&s->destroy_work); } /* Free a superblock that has never been seen by anyone */ static void destroy_unused_super(struct super_block *s) { if (!s) return; up_write(&s->s_umount); list_lru_destroy(&s->s_dentry_lru); list_lru_destroy(&s->s_inode_lru); security_sb_free(s); put_user_ns(s->s_user_ns); kfree(s->s_subtype); free_prealloced_shrinker(&s->s_shrink); /* no delays needed */ destroy_super_work(&s->destroy_work); } /** * alloc_super - create new superblock * @type: filesystem type superblock should belong to * @flags: the mount flags * @user_ns: User namespace for the super_block * * Allocates and initializes a new &struct super_block. 
alloc_super() * returns a pointer to the new superblock or %NULL if allocation has failed. */ static struct super_block *alloc_super(struct file_system_type *type, int flags, struct user_namespace *user_ns) { struct super_block *s = kzalloc(sizeof(struct super_block), GFP_USER); static const struct super_operations default_op; int i; if (!s) return NULL; INIT_LIST_HEAD(&s->s_mounts); s->s_user_ns = get_user_ns(user_ns); init_rwsem(&s->s_umount); lockdep_set_class(&s->s_umount, &type->s_umount_key); /* * sget() can have s_umount recursion. * * When it cannot find a suitable sb, it allocates a new * one (this one), and tries again to find a suitable old * one. * * In case that succeeds, it will acquire the s_umount * lock of the old one. Since these are clearly distinct * locks, and this object isn't exposed yet, there's no * risk of deadlocks. * * Annotate this by putting this lock in a different * subclass. */ down_write_nested(&s->s_umount, SINGLE_DEPTH_NESTING); if (security_sb_alloc(s)) goto fail; for (i = 0; i < SB_FREEZE_LEVELS; i++) { if (__percpu_init_rwsem(&s->s_writers.rw_sem[i], sb_writers_name[i], &type->s_writers_key[i])) goto fail; } init_waitqueue_head(&s->s_writers.wait_unfrozen); s->s_bdi = &noop_backing_dev_info; s->s_flags = flags; if (s->s_user_ns != &init_user_ns) s->s_iflags |= SB_I_NODEV; INIT_HLIST_NODE(&s->s_instances); INIT_HLIST_BL_HEAD(&s->s_roots); mutex_init(&s->s_sync_lock); INIT_LIST_HEAD(&s->s_inodes); spin_lock_init(&s->s_inode_list_lock); INIT_LIST_HEAD(&s->s_inodes_wb); spin_lock_init(&s->s_inode_wblist_lock); s->s_count = 1; atomic_set(&s->s_active, 1); mutex_init(&s->s_vfs_rename_mutex); lockdep_set_class(&s->s_vfs_rename_mutex, &type->s_vfs_rename_key); init_rwsem(&s->s_dquot.dqio_sem); s->s_maxbytes = MAX_NON_LFS; s->s_op = &default_op; s->s_time_gran = 1000000000; s->s_time_min = TIME64_MIN; s->s_time_max = TIME64_MAX; s->cleancache_poolid = CLEANCACHE_NO_POOL; s->s_shrink.seeks = DEFAULT_SEEKS; s->s_shrink.scan_objects = super_cache_scan; s->s_shrink.count_objects = super_cache_count; s->s_shrink.batch = 1024; s->s_shrink.flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE; if (prealloc_shrinker(&s->s_shrink)) goto fail; if (list_lru_init_memcg(&s->s_dentry_lru, &s->s_shrink)) goto fail; if (list_lru_init_memcg(&s->s_inode_lru, &s->s_shrink)) goto fail; return s; fail: destroy_unused_super(s); return NULL; } /* Superblock refcounting */ /* * Drop a superblock's refcount. The caller must hold sb_lock. */ static void __put_super(struct super_block *s) { if (!--s->s_count) { list_del_init(&s->s_list); WARN_ON(s->s_dentry_lru.node); WARN_ON(s->s_inode_lru.node); WARN_ON(!list_empty(&s->s_mounts)); security_sb_free(s); fscrypt_destroy_keyring(s); put_user_ns(s->s_user_ns); kfree(s->s_subtype); call_rcu(&s->rcu, destroy_super_rcu); } } /** * put_super - drop a temporary reference to superblock * @sb: superblock in question * * Drops a temporary reference, frees the superblock if there are no * references left. */ void put_super(struct super_block *sb) { spin_lock(&sb_lock); __put_super(sb); spin_unlock(&sb_lock); } /** * deactivate_locked_super - drop an active reference to superblock * @s: superblock to deactivate * * Drops an active reference to superblock, converting it into a temporary * one if there are no other active references left. In that case we * tell the fs driver to shut it down and drop the temporary reference we * had just acquired. * * Caller holds exclusive lock on superblock; that lock is released.
*/ void deactivate_locked_super(struct super_block *s) { struct file_system_type *fs = s->s_type; if (atomic_dec_and_test(&s->s_active)) { cleancache_invalidate_fs(s); unregister_shrinker(&s->s_shrink); fs->kill_sb(s); /* * Since list_lru_destroy() may sleep, we cannot call it from * put_super(), where we hold the sb_lock. Therefore we destroy * the lru lists right now. */ list_lru_destroy(&s->s_dentry_lru); list_lru_destroy(&s->s_inode_lru); put_filesystem(fs); put_super(s); } else { up_write(&s->s_umount); } } EXPORT_SYMBOL(deactivate_locked_super); /** * deactivate_super - drop an active reference to superblock * @s: superblock to deactivate * * Variant of deactivate_locked_super(), except that superblock is *not* * locked by caller. If we are going to drop the final active reference, * lock will be acquired prior to that. */ void deactivate_super(struct super_block *s) { if (!atomic_add_unless(&s->s_active, -1, 1)) { down_write(&s->s_umount); deactivate_locked_super(s); } } EXPORT_SYMBOL(deactivate_super); /** * grab_super - acquire an active reference * @s: reference we are trying to make active * * Tries to acquire an active reference. grab_super() is used when we * had just found a superblock in super_blocks or fs_type->fs_supers * and want to turn it into a full-blown active reference. grab_super() * is called with sb_lock held and drops it. Returns 1 in case of * success, 0 if we had failed (superblock contents was already dead or * dying when grab_super() had been called). Note that this is only * called for superblocks not in rundown mode (== ones still on ->fs_supers * of their type), so increment of ->s_count is OK here. */ static int grab_super(struct super_block *s) __releases(sb_lock) { s->s_count++; spin_unlock(&sb_lock); down_write(&s->s_umount); if ((s->s_flags & SB_BORN) && atomic_inc_not_zero(&s->s_active)) { put_super(s); return 1; } up_write(&s->s_umount); put_super(s); return 0; } /* * trylock_super - try to grab ->s_umount shared * @sb: reference we are trying to grab * * Try to prevent fs shutdown. This is used in places where we * cannot take an active reference but we need to ensure that the * filesystem is not shut down while we are working on it. It returns * false if we cannot acquire s_umount or if we lose the race and * filesystem already got into shutdown, and returns true with the s_umount * lock held in read mode in case of success. On successful return, * the caller must drop the s_umount lock when done. * * Note that unlike get_super() et.al. this one does *not* bump ->s_count. * The reason why it's safe is that we are OK with doing trylock instead * of down_read(). There's a couple of places that are OK with that, but * it's very much not a general-purpose interface. */ bool trylock_super(struct super_block *sb) { if (down_read_trylock(&sb->s_umount)) { if (!hlist_unhashed(&sb->s_instances) && sb->s_root && (sb->s_flags & SB_BORN)) return true; up_read(&sb->s_umount); } return false; } /** * generic_shutdown_super - common helper for ->kill_sb() * @sb: superblock to kill * * generic_shutdown_super() does all fs-independent work on superblock * shutdown. Typical ->kill_sb() should pick all fs-specific objects * that need destruction out of superblock, call generic_shutdown_super() * and release aforementioned objects. Note: dentries and inodes _are_ * taken care of and do not need specific handling. 
* * Upon calling this function, the filesystem may no longer alter or * rearrange the set of dentries belonging to this super_block, nor may it * change the attachments of dentries to inodes. */ void generic_shutdown_super(struct super_block *sb) { const struct super_operations *sop = sb->s_op; if (sb->s_root) { shrink_dcache_for_umount(sb); sync_filesystem(sb); sb->s_flags &= ~SB_ACTIVE; cgroup_writeback_umount(); /* Evict all inodes with zero refcount. */ evict_inodes(sb); /* * Clean up and evict any inodes that still have references due * to fsnotify or the security policy. */ fsnotify_sb_delete(sb); security_sb_delete(sb); /* * Now that all potentially-encrypted inodes have been evicted, * the fscrypt keyring can be destroyed. */ fscrypt_destroy_keyring(sb); if (sb->s_dio_done_wq) { destroy_workqueue(sb->s_dio_done_wq); sb->s_dio_done_wq = NULL; } if (sop->put_super) sop->put_super(sb); if (!list_empty(&sb->s_inodes)) { printk("VFS: Busy inodes after unmount of %s. " "Self-destruct in 5 seconds. Have a nice day...\n", sb->s_id); } } spin_lock(&sb_lock); /* should be initialized for __put_super_and_need_restart() */ hlist_del_init(&sb->s_instances); spin_unlock(&sb_lock); up_write(&sb->s_umount); if (sb->s_bdi != &noop_backing_dev_info) { if (sb->s_iflags & SB_I_PERSB_BDI) bdi_unregister(sb->s_bdi); bdi_put(sb->s_bdi); sb->s_bdi = &noop_backing_dev_info; } } EXPORT_SYMBOL(generic_shutdown_super); bool mount_capable(struct fs_context *fc) { if (!(fc->fs_type->fs_flags & FS_USERNS_MOUNT)) return capable(CAP_SYS_ADMIN); else return ns_capable(fc->user_ns, CAP_SYS_ADMIN); } /** * sget_fc - Find or create a superblock * @fc: Filesystem context. * @test: Comparison callback * @set: Setup callback * * Find or create a superblock using the parameters stored in the filesystem * context and the two callback functions. * * If an extant superblock is matched, then that will be returned with an * elevated reference count that the caller must transfer or discard. * * If no match is made, a new superblock will be allocated and basic * initialisation will be performed (s_type, s_fs_info and s_id will be set and * the set() callback will be invoked), the superblock will be published and it * will be returned in a partially constructed state with SB_BORN and SB_ACTIVE * as yet unset. */ struct super_block *sget_fc(struct fs_context *fc, int (*test)(struct super_block *, struct fs_context *), int (*set)(struct super_block *, struct fs_context *)) { struct super_block *s = NULL; struct super_block *old; struct user_namespace *user_ns = fc->global ? &init_user_ns : fc->user_ns; int err; /* * Never allow s_user_ns != &init_user_ns when FS_USERNS_MOUNT is * not set, as the filesystem is likely unprepared to handle it. * This can happen when fsconfig() is called from init_user_ns with * an fs_fd opened in another user namespace. 
*/ if (user_ns != &init_user_ns && !(fc->fs_type->fs_flags & FS_USERNS_MOUNT)) { errorfc(fc, "VFS: Mounting from non-initial user namespace is not allowed"); return ERR_PTR(-EPERM); } retry: spin_lock(&sb_lock); if (test) { hlist_for_each_entry(old, &fc->fs_type->fs_supers, s_instances) { if (test(old, fc)) goto share_extant_sb; } } if (!s) { spin_unlock(&sb_lock); s = alloc_super(fc->fs_type, fc->sb_flags, user_ns); if (!s) return ERR_PTR(-ENOMEM); goto retry; } s->s_fs_info = fc->s_fs_info; err = set(s, fc); if (err) { s->s_fs_info = NULL; spin_unlock(&sb_lock); destroy_unused_super(s); return ERR_PTR(err); } fc->s_fs_info = NULL; s->s_type = fc->fs_type; s->s_iflags |= fc->s_iflags; strlcpy(s->s_id, s->s_type->name, sizeof(s->s_id)); list_add_tail(&s->s_list, &super_blocks); hlist_add_head(&s->s_instances, &s->s_type->fs_supers); spin_unlock(&sb_lock); get_filesystem(s->s_type); register_shrinker_prepared(&s->s_shrink); return s; share_extant_sb: if (user_ns != old->s_user_ns) { spin_unlock(&sb_lock); destroy_unused_super(s); return ERR_PTR(-EBUSY); } if (!grab_super(old)) goto retry; destroy_unused_super(s); return old; } EXPORT_SYMBOL(sget_fc); /** * sget - find or create a superblock * @type: filesystem type superblock should belong to * @test: comparison callback * @set: setup callback * @flags: mount flags * @data: argument to each of them */ struct super_block *sget(struct file_system_type *type, int (*test)(struct super_block *,void *), int (*set)(struct super_block *,void *), int flags, void *data) { struct user_namespace *user_ns = current_user_ns(); struct super_block *s = NULL; struct super_block *old; int err; /* We don't yet pass the user namespace of the parent * mount through to here so always use &init_user_ns * until that changes. */ if (flags & SB_SUBMOUNT) user_ns = &init_user_ns; retry: spin_lock(&sb_lock); if (test) { hlist_for_each_entry(old, &type->fs_supers, s_instances) { if (!test(old, data)) continue; if (user_ns != old->s_user_ns) { spin_unlock(&sb_lock); destroy_unused_super(s); return ERR_PTR(-EBUSY); } if (!grab_super(old)) goto retry; destroy_unused_super(s); return old; } } if (!s) { spin_unlock(&sb_lock); s = alloc_super(type, (flags & ~SB_SUBMOUNT), user_ns); if (!s) return ERR_PTR(-ENOMEM); goto retry; } err = set(s, data); if (err) { spin_unlock(&sb_lock); destroy_unused_super(s); return ERR_PTR(err); } s->s_type = type; strlcpy(s->s_id, type->name, sizeof(s->s_id)); list_add_tail(&s->s_list, &super_blocks); hlist_add_head(&s->s_instances, &type->fs_supers); spin_unlock(&sb_lock); get_filesystem(type); register_shrinker_prepared(&s->s_shrink); return s; } EXPORT_SYMBOL(sget); void drop_super(struct super_block *sb) { up_read(&sb->s_umount); put_super(sb); } EXPORT_SYMBOL(drop_super); void drop_super_exclusive(struct super_block *sb) { up_write(&sb->s_umount); put_super(sb); } EXPORT_SYMBOL(drop_super_exclusive); static void __iterate_supers(void (*f)(struct super_block *)) { struct super_block *sb, *p = NULL; spin_lock(&sb_lock); list_for_each_entry(sb, &super_blocks, s_list) { if (hlist_unhashed(&sb->s_instances)) continue; sb->s_count++; spin_unlock(&sb_lock); f(sb); spin_lock(&sb_lock); if (p) __put_super(p); p = sb; } if (p) __put_super(p); spin_unlock(&sb_lock); } /** * iterate_supers - call function for all active superblocks * @f: function to call * @arg: argument to pass to it * * Scans the superblock list and calls given function, passing it * locked superblock and given argument. 
*/ void iterate_supers(void (*f)(struct super_block *, void *), void *arg) { struct super_block *sb, *p = NULL; spin_lock(&sb_lock); list_for_each_entry(sb, &super_blocks, s_list) { if (hlist_unhashed(&sb->s_instances)) continue; sb->s_count++; spin_unlock(&sb_lock); down_read(&sb->s_umount); if (sb->s_root && (sb->s_flags & SB_BORN)) f(sb, arg); up_read(&sb->s_umount); spin_lock(&sb_lock); if (p) __put_super(p); p = sb; } if (p) __put_super(p); spin_unlock(&sb_lock); } /** * iterate_supers_type - call function for superblocks of given type * @type: fs type * @f: function to call * @arg: argument to pass to it * * Scans the superblock list and calls given function, passing it * locked superblock and given argument. */ void iterate_supers_type(struct file_system_type *type, void (*f)(struct super_block *, void *), void *arg) { struct super_block *sb, *p = NULL; spin_lock(&sb_lock); hlist_for_each_entry(sb, &type->fs_supers, s_instances) { sb->s_count++; spin_unlock(&sb_lock); down_read(&sb->s_umount); if (sb->s_root && (sb->s_flags & SB_BORN)) f(sb, arg); up_read(&sb->s_umount); spin_lock(&sb_lock); if (p) __put_super(p); p = sb; } if (p) __put_super(p); spin_unlock(&sb_lock); } EXPORT_SYMBOL(iterate_supers_type); /** * get_super - get the superblock of a device * @bdev: device to get the superblock for * * Scans the superblock list and finds the superblock of the file system * mounted on the device given. %NULL is returned if no match is found. */ struct super_block *get_super(struct block_device *bdev) { struct super_block *sb; if (!bdev) return NULL; spin_lock(&sb_lock); rescan: list_for_each_entry(sb, &super_blocks, s_list) { if (hlist_unhashed(&sb->s_instances)) continue; if (sb->s_bdev == bdev) { sb->s_count++; spin_unlock(&sb_lock); down_read(&sb->s_umount); /* still alive? */ if (sb->s_root && (sb->s_flags & SB_BORN)) return sb; up_read(&sb->s_umount); /* nope, got unmounted */ spin_lock(&sb_lock); __put_super(sb); goto rescan; } } spin_unlock(&sb_lock); return NULL; } /** * get_active_super - get an active reference to the superblock of a device * @bdev: device to get the superblock for * * Scans the superblock list and finds the superblock of the file system * mounted on the device given. Returns the superblock with an active * reference or %NULL if none was found. */ struct super_block *get_active_super(struct block_device *bdev) { struct super_block *sb; if (!bdev) return NULL; restart: spin_lock(&sb_lock); list_for_each_entry(sb, &super_blocks, s_list) { if (hlist_unhashed(&sb->s_instances)) continue; if (sb->s_bdev == bdev) { if (!grab_super(sb)) goto restart; up_write(&sb->s_umount); return sb; } } spin_unlock(&sb_lock); return NULL; } struct super_block *user_get_super(dev_t dev, bool excl) { struct super_block *sb; spin_lock(&sb_lock); rescan: list_for_each_entry(sb, &super_blocks, s_list) { if (hlist_unhashed(&sb->s_instances)) continue; if (sb->s_dev == dev) { sb->s_count++; spin_unlock(&sb_lock); if (excl) down_write(&sb->s_umount); else down_read(&sb->s_umount); /* still alive? */ if (sb->s_root && (sb->s_flags & SB_BORN)) return sb; if (excl) up_write(&sb->s_umount); else up_read(&sb->s_umount); /* nope, got unmounted */ spin_lock(&sb_lock); __put_super(sb); goto rescan; } } spin_unlock(&sb_lock); return NULL; } /** * reconfigure_super - asks filesystem to change superblock parameters * @fc: The superblock and configuration * * Alters the configuration parameters of a live superblock. 
*/ int reconfigure_super(struct fs_context *fc) { struct super_block *sb = fc->root->d_sb; int retval; bool remount_ro = false; bool remount_rw = false; bool force = fc->sb_flags & SB_FORCE; if (fc->sb_flags_mask & ~MS_RMT_MASK) return -EINVAL; if (sb->s_writers.frozen != SB_UNFROZEN) return -EBUSY; retval = security_sb_remount(sb, fc->security); if (retval) return retval; if (fc->sb_flags_mask & SB_RDONLY) { #ifdef CONFIG_BLOCK if (!(fc->sb_flags & SB_RDONLY) && sb->s_bdev && bdev_read_only(sb->s_bdev)) return -EACCES; #endif remount_rw = !(fc->sb_flags & SB_RDONLY) && sb_rdonly(sb); remount_ro = (fc->sb_flags & SB_RDONLY) && !sb_rdonly(sb); } if (remount_ro) { if (!hlist_empty(&sb->s_pins)) { up_write(&sb->s_umount); group_pin_kill(&sb->s_pins); down_write(&sb->s_umount); if (!sb->s_root) return 0; if (sb->s_writers.frozen != SB_UNFROZEN) return -EBUSY; remount_ro = !sb_rdonly(sb); } } shrink_dcache_sb(sb); /* If we are reconfiguring to RDONLY and current sb is read/write, * make sure there are no files open for writing. */ if (remount_ro) { if (force) { sb->s_readonly_remount = 1; smp_wmb(); } else { retval = sb_prepare_remount_readonly(sb); if (retval) return retval; } } else if (remount_rw) { /* * We set s_readonly_remount here to protect filesystem's * reconfigure code from writes from userspace until * reconfigure finishes. */ sb->s_readonly_remount = 1; smp_wmb(); } if (fc->ops->reconfigure) { retval = fc->ops->reconfigure(fc); if (retval) { if (!force) goto cancel_readonly; /* If forced remount, go ahead despite any errors */ WARN(1, "forced remount of a %s fs returned %i\n", sb->s_type->name, retval); } } WRITE_ONCE(sb->s_flags, ((sb->s_flags & ~fc->sb_flags_mask) | (fc->sb_flags & fc->sb_flags_mask))); /* Needs to be ordered wrt mnt_is_readonly() */ smp_wmb(); sb->s_readonly_remount = 0; /* * Some filesystems modify their metadata via some other path than the * bdev buffer cache (eg. use a private mapping, or directories in * pagecache, etc). Also file data modifications go via their own * mappings. So If we try to mount readonly then copy the filesystem * from bdev, we could get stale data, so invalidate it to give a best * effort at coherency. 
*/ if (remount_ro && sb->s_bdev) invalidate_bdev(sb->s_bdev); return 0; cancel_readonly: sb->s_readonly_remount = 0; return retval; } static void do_emergency_remount_callback(struct super_block *sb) { down_write(&sb->s_umount); if (sb->s_root && sb->s_bdev && (sb->s_flags & SB_BORN) && !sb_rdonly(sb)) { struct fs_context *fc; fc = fs_context_for_reconfigure(sb->s_root, SB_RDONLY | SB_FORCE, SB_RDONLY); if (!IS_ERR(fc)) { if (parse_monolithic_mount_data(fc, NULL) == 0) (void)reconfigure_super(fc); put_fs_context(fc); } } up_write(&sb->s_umount); } static void do_emergency_remount(struct work_struct *work) { __iterate_supers(do_emergency_remount_callback); kfree(work); printk("Emergency Remount complete\n"); } void emergency_remount(void) { struct work_struct *work; work = kmalloc(sizeof(*work), GFP_ATOMIC); if (work) { INIT_WORK(work, do_emergency_remount); schedule_work(work); } } static void do_thaw_all_callback(struct super_block *sb) { down_write(&sb->s_umount); if (sb->s_root && sb->s_flags & SB_BORN) { emergency_thaw_bdev(sb); thaw_super_locked(sb); } else { up_write(&sb->s_umount); } } static void do_thaw_all(struct work_struct *work) { __iterate_supers(do_thaw_all_callback); kfree(work); printk(KERN_WARNING "Emergency Thaw complete\n"); } /** * emergency_thaw_all -- forcibly thaw every frozen filesystem * * Used for emergency unfreeze of all filesystems via SysRq */ void emergency_thaw_all(void) { struct work_struct *work; work = kmalloc(sizeof(*work), GFP_ATOMIC); if (work) { INIT_WORK(work, do_thaw_all); schedule_work(work); } } static DEFINE_IDA(unnamed_dev_ida); /** * get_anon_bdev - Allocate a block device for filesystems which don't have one. * @p: Pointer to a dev_t. * * Filesystems which don't use real block devices can call this function * to allocate a virtual block device. * * Context: Any context. Frequently called while holding sb_lock. * Return: 0 on success, -EMFILE if there are no anonymous bdevs left * or -ENOMEM if memory allocation failed. */ int get_anon_bdev(dev_t *p) { int dev; /* * Many userspace utilities consider an FSID of 0 invalid. * Always return at least 1 from get_anon_bdev. */ dev = ida_alloc_range(&unnamed_dev_ida, 1, (1 << MINORBITS) - 1, GFP_ATOMIC); if (dev == -ENOSPC) dev = -EMFILE; if (dev < 0) return dev; *p = MKDEV(0, dev); return 0; } EXPORT_SYMBOL(get_anon_bdev); void free_anon_bdev(dev_t dev) { ida_free(&unnamed_dev_ida, MINOR(dev)); } EXPORT_SYMBOL(free_anon_bdev); int set_anon_super(struct super_block *s, void *data) { return get_anon_bdev(&s->s_dev); } EXPORT_SYMBOL(set_anon_super); void kill_anon_super(struct super_block *sb) { dev_t dev = sb->s_dev; generic_shutdown_super(sb); free_anon_bdev(dev); } EXPORT_SYMBOL(kill_anon_super); void kill_litter_super(struct super_block *sb) { if (sb->s_root) d_genocide(sb->s_root); kill_anon_super(sb); } EXPORT_SYMBOL(kill_litter_super); int set_anon_super_fc(struct super_block *sb, struct fs_context *fc) { return set_anon_super(sb, NULL); } EXPORT_SYMBOL(set_anon_super_fc); static int test_keyed_super(struct super_block *sb, struct fs_context *fc) { return sb->s_fs_info == fc->s_fs_info; } static int test_single_super(struct super_block *s, struct fs_context *fc) { return 1; } /** * vfs_get_super - Get a superblock with a search key set in s_fs_info. * @fc: The filesystem context holding the parameters * @keying: How to distinguish superblocks * @fill_super: Helper to initialise a new superblock * * Search for a superblock and create a new one if not found. 
The search * criterion is controlled by @keying. If the search fails, a new superblock * is created and @fill_super() is called to initialise it. * * @keying can take one of a number of values: * * (1) vfs_get_single_super - Only one superblock of this type may exist on the * system. This is typically used for special system filesystems. * * (2) vfs_get_keyed_super - Multiple superblocks may exist, but they must have * distinct keys (where the key is in s_fs_info). Searching for the same * key again will turn up the superblock for that key. * * (3) vfs_get_independent_super - Multiple superblocks may exist and are * unkeyed. Each call will get a new superblock. * * A permissions check is made by sget_fc() unless we're getting a superblock * for a kernel-internal mount or a submount. */ int vfs_get_super(struct fs_context *fc, enum vfs_get_super_keying keying, int (*fill_super)(struct super_block *sb, struct fs_context *fc)) { int (*test)(struct super_block *, struct fs_context *); struct super_block *sb; int err; switch (keying) { case vfs_get_single_super: case vfs_get_single_reconf_super: test = test_single_super; break; case vfs_get_keyed_super: test = test_keyed_super; break; case vfs_get_independent_super: test = NULL; break; default: BUG(); } sb = sget_fc(fc, test, set_anon_super_fc); if (IS_ERR(sb)) return PTR_ERR(sb); if (!sb->s_root) { err = fill_super(sb, fc); if (err) goto error; sb->s_flags |= SB_ACTIVE; fc->root = dget(sb->s_root); } else { fc->root = dget(sb->s_root); if (keying == vfs_get_single_reconf_super) { err = reconfigure_super(fc); if (err < 0) { dput(fc->root); fc->root = NULL; goto error; } } } return 0; error: deactivate_locked_super(sb); return err; } EXPORT_SYMBOL(vfs_get_super); int get_tree_nodev(struct fs_context *fc, int (*fill_super)(struct super_block *sb, struct fs_context *fc)) { return vfs_get_super(fc, vfs_get_independent_super, fill_super); } EXPORT_SYMBOL(get_tree_nodev); int get_tree_single(struct fs_context *fc, int (*fill_super)(struct super_block *sb, struct fs_context *fc)) { return vfs_get_super(fc, vfs_get_single_super, fill_super); } EXPORT_SYMBOL(get_tree_single); int get_tree_single_reconf(struct fs_context *fc, int (*fill_super)(struct super_block *sb, struct fs_context *fc)) { return vfs_get_super(fc, vfs_get_single_reconf_super, fill_super); } EXPORT_SYMBOL(get_tree_single_reconf); int get_tree_keyed(struct fs_context *fc, int (*fill_super)(struct super_block *sb, struct fs_context *fc), void *key) { fc->s_fs_info = key; return vfs_get_super(fc, vfs_get_keyed_super, fill_super); } EXPORT_SYMBOL(get_tree_keyed); #ifdef CONFIG_BLOCK static int set_bdev_super(struct super_block *s, void *data) { s->s_bdev = data; s->s_dev = s->s_bdev->bd_dev; s->s_bdi = bdi_get(s->s_bdev->bd_disk->bdi); if (blk_queue_stable_writes(s->s_bdev->bd_disk->queue)) s->s_iflags |= SB_I_STABLE_WRITES; return 0; } static int set_bdev_super_fc(struct super_block *s, struct fs_context *fc) { return set_bdev_super(s, fc->sget_key); } static int test_bdev_super_fc(struct super_block *s, struct fs_context *fc) { return s->s_bdev == fc->sget_key; } /** * get_tree_bdev - Get a superblock based on a single block device * @fc: The filesystem context holding the parameters * @fill_super: Helper to initialise a new superblock */ int get_tree_bdev(struct fs_context *fc, int (*fill_super)(struct super_block *, struct fs_context *)) { struct block_device *bdev; struct super_block *s; fmode_t mode = FMODE_READ | FMODE_EXCL; int error = 0; if (!(fc->sb_flags & SB_RDONLY)) mode |= 
FMODE_WRITE; if (!fc->source) return invalf(fc, "No source specified"); bdev = blkdev_get_by_path(fc->source, mode, fc->fs_type); if (IS_ERR(bdev)) { errorf(fc, "%s: Can't open blockdev", fc->source); return PTR_ERR(bdev); } /* Once the superblock is inserted into the list by sget_fc(), s_umount * will protect the lockfs code from trying to start a snapshot while * we are mounting */ mutex_lock(&bdev->bd_fsfreeze_mutex); if (bdev->bd_fsfreeze_count > 0) { mutex_unlock(&bdev->bd_fsfreeze_mutex); warnf(fc, "%pg: Can't mount, blockdev is frozen", bdev); blkdev_put(bdev, mode); return -EBUSY; } fc->sb_flags |= SB_NOSEC; fc->sget_key = bdev; s = sget_fc(fc, test_bdev_super_fc, set_bdev_super_fc); mutex_unlock(&bdev->bd_fsfreeze_mutex); if (IS_ERR(s)) { blkdev_put(bdev, mode); return PTR_ERR(s); } if (s->s_root) { /* Don't summarily change the RO/RW state. */ if ((fc->sb_flags ^ s->s_flags) & SB_RDONLY) { warnf(fc, "%pg: Can't mount, would change RO state", bdev); deactivate_locked_super(s); blkdev_put(bdev, mode); return -EBUSY; } /* * s_umount nests inside open_mutex during * __invalidate_device(). blkdev_put() acquires * open_mutex and can't be called under s_umount. Drop * s_umount temporarily. This is safe as we're * holding an active reference. */ up_write(&s->s_umount); blkdev_put(bdev, mode); down_write(&s->s_umount); } else { s->s_mode = mode; snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev); sb_set_blocksize(s, block_size(bdev)); error = fill_super(s, fc); if (error) { deactivate_locked_super(s); return error; } s->s_flags |= SB_ACTIVE; bdev->bd_super = s; } BUG_ON(fc->root); fc->root = dget(s->s_root); return 0; } EXPORT_SYMBOL(get_tree_bdev); static int test_bdev_super(struct super_block *s, void *data) { return (void *)s->s_bdev == data; } struct dentry *mount_bdev(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, int (*fill_super)(struct super_block *, void *, int)) { struct block_device *bdev; struct super_block *s; fmode_t mode = FMODE_READ | FMODE_EXCL; int error = 0; if (!(flags & SB_RDONLY)) mode |= FMODE_WRITE; bdev = blkdev_get_by_path(dev_name, mode, fs_type); if (IS_ERR(bdev)) return ERR_CAST(bdev); /* * once the super is inserted into the list by sget, s_umount * will protect the lockfs code from trying to start a snapshot * while we are mounting */ mutex_lock(&bdev->bd_fsfreeze_mutex); if (bdev->bd_fsfreeze_count > 0) { mutex_unlock(&bdev->bd_fsfreeze_mutex); error = -EBUSY; goto error_bdev; } s = sget(fs_type, test_bdev_super, set_bdev_super, flags | SB_NOSEC, bdev); mutex_unlock(&bdev->bd_fsfreeze_mutex); if (IS_ERR(s)) goto error_s; if (s->s_root) { if ((flags ^ s->s_flags) & SB_RDONLY) { deactivate_locked_super(s); error = -EBUSY; goto error_bdev; } /* * s_umount nests inside open_mutex during * __invalidate_device(). blkdev_put() acquires * open_mutex and can't be called under s_umount. Drop * s_umount temporarily. This is safe as we're * holding an active reference. */ up_write(&s->s_umount); blkdev_put(bdev, mode); down_write(&s->s_umount); } else { s->s_mode = mode; snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev); sb_set_blocksize(s, block_size(bdev)); error = fill_super(s, data, flags & SB_SILENT ? 
1 : 0); if (error) { deactivate_locked_super(s); goto error; } s->s_flags |= SB_ACTIVE; bdev->bd_super = s; } return dget(s->s_root); error_s: error = PTR_ERR(s); error_bdev: blkdev_put(bdev, mode); error: return ERR_PTR(error); } EXPORT_SYMBOL(mount_bdev); void kill_block_super(struct super_block *sb) { struct block_device *bdev = sb->s_bdev; fmode_t mode = sb->s_mode; bdev->bd_super = NULL; generic_shutdown_super(sb); sync_blockdev(bdev); WARN_ON_ONCE(!(mode & FMODE_EXCL)); blkdev_put(bdev, mode | FMODE_EXCL); } EXPORT_SYMBOL(kill_block_super); #endif struct dentry *mount_nodev(struct file_system_type *fs_type, int flags, void *data, int (*fill_super)(struct super_block *, void *, int)) { int error; struct super_block *s = sget(fs_type, NULL, set_anon_super, flags, NULL); if (IS_ERR(s)) return ERR_CAST(s); error = fill_super(s, data, flags & SB_SILENT ? 1 : 0); if (error) { deactivate_locked_super(s); return ERR_PTR(error); } s->s_flags |= SB_ACTIVE; return dget(s->s_root); } EXPORT_SYMBOL(mount_nodev); int reconfigure_single(struct super_block *s, int flags, void *data) { struct fs_context *fc; int ret; /* The caller really need to be passing fc down into mount_single(), * then a chunk of this can be removed. [Bollocks -- AV] * Better yet, reconfiguration shouldn't happen, but rather the second * mount should be rejected if the parameters are not compatible. */ fc = fs_context_for_reconfigure(s->s_root, flags, MS_RMT_MASK); if (IS_ERR(fc)) return PTR_ERR(fc); ret = parse_monolithic_mount_data(fc, data); if (ret < 0) goto out; ret = reconfigure_super(fc); out: put_fs_context(fc); return ret; } static int compare_single(struct super_block *s, void *p) { return 1; } struct dentry *mount_single(struct file_system_type *fs_type, int flags, void *data, int (*fill_super)(struct super_block *, void *, int)) { struct super_block *s; int error; s = sget(fs_type, compare_single, set_anon_super, flags, NULL); if (IS_ERR(s)) return ERR_CAST(s); if (!s->s_root) { error = fill_super(s, data, flags & SB_SILENT ? 1 : 0); if (!error) s->s_flags |= SB_ACTIVE; } else { error = reconfigure_single(s, flags, data); } if (unlikely(error)) { deactivate_locked_super(s); return ERR_PTR(error); } return dget(s->s_root); } EXPORT_SYMBOL(mount_single); /** * vfs_get_tree - Get the mountable root * @fc: The superblock configuration context. * * The filesystem is invoked to get or create a superblock which can then later * be used for mounting. The filesystem places a pointer to the root to be * used for mounting in @fc->root. */ int vfs_get_tree(struct fs_context *fc) { struct super_block *sb; int error; if (fc->root) return -EBUSY; /* Get the mountable root in fc->root, with a ref on the root and a ref * on the superblock. */ error = fc->ops->get_tree(fc); if (error < 0) return error; if (!fc->root) { pr_err("Filesystem %s get_tree() didn't set fc->root\n", fc->fs_type->name); /* We don't know what the locking state of the superblock is - * if there is a superblock. */ BUG(); } sb = fc->root->d_sb; WARN_ON(!sb->s_bdi); /* * Write barrier is for super_cache_count(). We place it before setting * SB_BORN as the data dependency between the two functions is the * superblock structure contents that we just set up, not the SB_BORN * flag. 
*/ smp_wmb(); sb->s_flags |= SB_BORN; error = security_sb_set_mnt_opts(sb, fc->security, 0, NULL); if (unlikely(error)) { fc_drop_locked(fc); return error; } /* * filesystems should never set s_maxbytes larger than MAX_LFS_FILESIZE * but s_maxbytes was an unsigned long long for many releases. Throw * this warning for a little while to try and catch filesystems that * violate this rule. */ WARN((sb->s_maxbytes < 0), "%s set sb->s_maxbytes to " "negative value (%lld)\n", fc->fs_type->name, sb->s_maxbytes); return 0; } EXPORT_SYMBOL(vfs_get_tree); /* * Setup private BDI for given superblock. It gets automatically cleaned up * in generic_shutdown_super(). */ int super_setup_bdi_name(struct super_block *sb, char *fmt, ...) { struct backing_dev_info *bdi; int err; va_list args; bdi = bdi_alloc(NUMA_NO_NODE); if (!bdi) return -ENOMEM; va_start(args, fmt); err = bdi_register_va(bdi, fmt, args); va_end(args); if (err) { bdi_put(bdi); return err; } WARN_ON(sb->s_bdi != &noop_backing_dev_info); sb->s_bdi = bdi; sb->s_iflags |= SB_I_PERSB_BDI; return 0; } EXPORT_SYMBOL(super_setup_bdi_name); /* * Setup private BDI for given superblock. I gets automatically cleaned up * in generic_shutdown_super(). */ int super_setup_bdi(struct super_block *sb) { static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); return super_setup_bdi_name(sb, "%.28s-%ld", sb->s_type->name, atomic_long_inc_return(&bdi_seq)); } EXPORT_SYMBOL(super_setup_bdi); /** * sb_wait_write - wait until all writers to given file system finish * @sb: the super for which we wait * @level: type of writers we wait for (normal vs page fault) * * This function waits until there are no writers of given type to given file * system. */ static void sb_wait_write(struct super_block *sb, int level) { percpu_down_write(sb->s_writers.rw_sem + level-1); } /* * We are going to return to userspace and forget about these locks, the * ownership goes to the caller of thaw_super() which does unlock(). */ static void lockdep_sb_freeze_release(struct super_block *sb) { int level; for (level = SB_FREEZE_LEVELS - 1; level >= 0; level--) percpu_rwsem_release(sb->s_writers.rw_sem + level, 0, _THIS_IP_); } /* * Tell lockdep we are holding these locks before we call ->unfreeze_fs(sb). */ static void lockdep_sb_freeze_acquire(struct super_block *sb) { int level; for (level = 0; level < SB_FREEZE_LEVELS; ++level) percpu_rwsem_acquire(sb->s_writers.rw_sem + level, 0, _THIS_IP_); } static void sb_freeze_unlock(struct super_block *sb, int level) { for (level--; level >= 0; level--) percpu_up_write(sb->s_writers.rw_sem + level); } /** * freeze_super - lock the filesystem and force it into a consistent state * @sb: the super to lock * * Syncs the super to make sure the filesystem is consistent and calls the fs's * freeze_fs. Subsequent calls to this without first thawing the fs will return * -EBUSY. * * During this function, sb->s_writers.frozen goes through these values: * * SB_UNFROZEN: File system is normal, all writes progress as usual. * * SB_FREEZE_WRITE: The file system is in the process of being frozen. New * writes should be blocked, though page faults are still allowed. We wait for * all writes to complete and then proceed to the next stage. * * SB_FREEZE_PAGEFAULT: Freezing continues. Now also page faults are blocked * but internal fs threads can still modify the filesystem (although they * should not dirty new pages or inodes), writeback can run etc. 
After waiting * for all running page faults we sync the filesystem which will clean all * dirty pages and inodes (no new dirty pages or inodes can be created when * sync is running). * * SB_FREEZE_FS: The file system is frozen. Now all internal sources of fs * modification are blocked (e.g. XFS preallocation truncation on inode * reclaim). This is usually implemented by blocking new transactions for * filesystems that have them and need this additional guard. After all * internal writers are finished we call ->freeze_fs() to finish filesystem * freezing. Then we transition to SB_FREEZE_COMPLETE state. This state is * mostly auxiliary for filesystems to verify they do not modify frozen fs. * * sb->s_writers.frozen is protected by sb->s_umount. */ int freeze_super(struct super_block *sb) { int ret; atomic_inc(&sb->s_active); down_write(&sb->s_umount); if (sb->s_writers.frozen != SB_UNFROZEN) { deactivate_locked_super(sb); return -EBUSY; } if (!(sb->s_flags & SB_BORN)) { up_write(&sb->s_umount); return 0; /* sic - it's "nothing to do" */ } if (sb_rdonly(sb)) { /* Nothing to do really... */ sb->s_writers.frozen = SB_FREEZE_COMPLETE; up_write(&sb->s_umount); return 0; } sb->s_writers.frozen = SB_FREEZE_WRITE; /* Release s_umount to preserve sb_start_write -> s_umount ordering */ up_write(&sb->s_umount); sb_wait_write(sb, SB_FREEZE_WRITE); down_write(&sb->s_umount); /* Now we go and block page faults... */ sb->s_writers.frozen = SB_FREEZE_PAGEFAULT; sb_wait_write(sb, SB_FREEZE_PAGEFAULT); /* All writers are done so after syncing there won't be dirty data */ ret = sync_filesystem(sb); if (ret) { sb->s_writers.frozen = SB_UNFROZEN; sb_freeze_unlock(sb, SB_FREEZE_PAGEFAULT); wake_up(&sb->s_writers.wait_unfrozen); deactivate_locked_super(sb); return ret; } /* Now wait for internal filesystem counter */ sb->s_writers.frozen = SB_FREEZE_FS; sb_wait_write(sb, SB_FREEZE_FS); if (sb->s_op->freeze_fs) { ret = sb->s_op->freeze_fs(sb); if (ret) { printk(KERN_ERR "VFS:Filesystem freeze failed\n"); sb->s_writers.frozen = SB_UNFROZEN; sb_freeze_unlock(sb, SB_FREEZE_FS); wake_up(&sb->s_writers.wait_unfrozen); deactivate_locked_super(sb); return ret; } } /* * For debugging purposes so that fs can warn if it sees write activity * when frozen is set to SB_FREEZE_COMPLETE, and for thaw_super(). */ sb->s_writers.frozen = SB_FREEZE_COMPLETE; lockdep_sb_freeze_release(sb); up_write(&sb->s_umount); return 0; } EXPORT_SYMBOL(freeze_super); static int thaw_super_locked(struct super_block *sb) { int error; if (sb->s_writers.frozen != SB_FREEZE_COMPLETE) { up_write(&sb->s_umount); return -EINVAL; } if (sb_rdonly(sb)) { sb->s_writers.frozen = SB_UNFROZEN; goto out; } lockdep_sb_freeze_acquire(sb); if (sb->s_op->unfreeze_fs) { error = sb->s_op->unfreeze_fs(sb); if (error) { printk(KERN_ERR "VFS:Filesystem thaw failed\n"); lockdep_sb_freeze_release(sb); up_write(&sb->s_umount); return error; } } sb->s_writers.frozen = SB_UNFROZEN; sb_freeze_unlock(sb, SB_FREEZE_FS); out: wake_up(&sb->s_writers.wait_unfrozen); deactivate_locked_super(sb); return 0; } /** * thaw_super -- unlock filesystem * @sb: the super to thaw * * Unlocks the filesystem and marks it writeable again after freeze_super(). */ int thaw_super(struct super_block *sb) { down_write(&sb->s_umount); return thaw_super_locked(sb); } EXPORT_SYMBOL(thaw_super);
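/*
 * Illustrative sketch (not from fs/super.c): how a minimal pseudo filesystem
 * might sit on top of the helpers above. "examplefs" and all of its callbacks
 * are hypothetical names invented for this example; the VFS interfaces used
 * (get_tree_nodev(), kill_anon_super(), register_filesystem(), the libfs
 * "simple_*" defaults) are existing kernel APIs, and get_tree_nodev() funnels
 * into vfs_get_super()/sget_fc() exactly as shown above.
 */
#include <linux/fs.h>
#include <linux/fs_context.h>
#include <linux/module.h>

static int examplefs_fill_super(struct super_block *sb, struct fs_context *fc)
{
	struct inode *root;

	sb->s_magic = 0x6578616d;		/* arbitrary magic for the sketch */
	sb->s_op = &simple_super_operations;	/* libfs defaults */

	root = new_inode(sb);
	if (!root)
		return -ENOMEM;
	root->i_ino = 1;
	root->i_mode = S_IFDIR | 0755;
	root->i_op = &simple_dir_inode_operations;
	root->i_fop = &simple_dir_operations;

	sb->s_root = d_make_root(root);
	return sb->s_root ? 0 : -ENOMEM;
}

static int examplefs_get_tree(struct fs_context *fc)
{
	/* Unkeyed: each mount gets its own superblock (vfs_get_independent_super). */
	return get_tree_nodev(fc, examplefs_fill_super);
}

static const struct fs_context_operations examplefs_context_ops = {
	.get_tree	= examplefs_get_tree,
};

static int examplefs_init_fs_context(struct fs_context *fc)
{
	fc->ops = &examplefs_context_ops;
	return 0;
}

static struct file_system_type examplefs_type = {
	.owner		 = THIS_MODULE,
	.name		 = "examplefs",
	.init_fs_context = examplefs_init_fs_context,
	.kill_sb	 = kill_anon_super,	/* releases the anon dev set up via set_anon_super_fc() */
};

static int __init examplefs_init(void)
{
	return register_filesystem(&examplefs_type);
}

static void __exit examplefs_exit(void)
{
	unregister_filesystem(&examplefs_type);
}

module_init(examplefs_init);
module_exit(examplefs_exit);
MODULE_LICENSE("GPL");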
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __NET_GENERIC_NETLINK_H #define __NET_GENERIC_NETLINK_H #include <linux/genetlink.h> #include <net/netlink.h> #include <net/net_namespace.h> #define GENLMSG_DEFAULT_SIZE (NLMSG_DEFAULT_SIZE - GENL_HDRLEN) /** * struct genl_multicast_group - generic netlink multicast group * @name: name of the multicast group, names are per-family * @flags: GENL_* flags (%GENL_ADMIN_PERM or %GENL_UNS_ADMIN_PERM) * @cap_sys_admin: whether %CAP_SYS_ADMIN is required for binding */ struct genl_multicast_group { char name[GENL_NAMSIZ]; u8 flags; u8 cap_sys_admin:1; }; struct genl_ops; struct genl_info; /** * struct genl_family - generic netlink family * @id: protocol family identifier (private) * @hdrsize: length of user specific header in bytes * @name: name of family * @version: protocol version * @maxattr: maximum number of attributes supported * @policy: netlink policy * @netnsok: set to true if the family can handle network * namespaces and should be presented in all of them * @parallel_ops: operations can be called in parallel and aren't * synchronized by the core genetlink code * @pre_doit: called before an operation's doit callback, it may * do additional, common, filtering and return an error * @post_doit: called after an operation's doit callback, it may * undo operations done by pre_doit, for example release locks * @mcgrps: multicast groups used by this family * @n_mcgrps: number of multicast groups * @mcgrp_offset: starting number of multicast group IDs in this family * (private) * @ops: the operations supported by this family * @n_ops: number of operations supported by this family * @small_ops: the small-struct operations supported by this family * @n_small_ops: number of small-struct operations supported by this family */ struct genl_family { int id; /* private */ unsigned int hdrsize; char name[GENL_NAMSIZ];
unsigned int version; unsigned int maxattr; unsigned int mcgrp_offset; /* private */ u8 netnsok:1; u8 parallel_ops:1; u8 n_ops; u8 n_small_ops; u8 n_mcgrps; const struct nla_policy *policy; int (*pre_doit)(const struct genl_ops *ops, struct sk_buff *skb, struct genl_info *info); void (*post_doit)(const struct genl_ops *ops, struct sk_buff *skb, struct genl_info *info); const struct genl_ops * ops; const struct genl_small_ops *small_ops; const struct genl_multicast_group *mcgrps; struct module *module; }; /** * struct genl_info - receiving information * @snd_seq: sending sequence number * @snd_portid: netlink portid of sender * @nlhdr: netlink message header * @genlhdr: generic netlink message header * @userhdr: user specific header * @attrs: netlink attributes * @_net: network namespace * @user_ptr: user pointers * @extack: extended ACK report struct */ struct genl_info { u32 snd_seq; u32 snd_portid; struct nlmsghdr * nlhdr; struct genlmsghdr * genlhdr; void * userhdr; struct nlattr ** attrs; possible_net_t _net; void * user_ptr[2]; struct netlink_ext_ack *extack; }; static inline struct net *genl_info_net(struct genl_info *info) { return read_pnet(&info->_net); } static inline void genl_info_net_set(struct genl_info *info, struct net *net) { write_pnet(&info->_net, net); } #define GENL_SET_ERR_MSG(info, msg) NL_SET_ERR_MSG((info)->extack, msg) enum genl_validate_flags { GENL_DONT_VALIDATE_STRICT = BIT(0), GENL_DONT_VALIDATE_DUMP = BIT(1), GENL_DONT_VALIDATE_DUMP_STRICT = BIT(2), }; /** * struct genl_small_ops - generic netlink operations (small version) * @cmd: command identifier * @internal_flags: flags used by the family * @flags: GENL_* flags (%GENL_ADMIN_PERM or %GENL_UNS_ADMIN_PERM) * @validate: validation flags from enum genl_validate_flags * @doit: standard command callback * @dumpit: callback for dumpers * * This is a cut-down version of struct genl_ops for users who don't need * most of the ancillary infra and want to save space. 
*/ struct genl_small_ops { int (*doit)(struct sk_buff *skb, struct genl_info *info); int (*dumpit)(struct sk_buff *skb, struct netlink_callback *cb); u8 cmd; u8 internal_flags; u8 flags; u8 validate; }; /** * struct genl_ops - generic netlink operations * @cmd: command identifier * @internal_flags: flags used by the family * @flags: GENL_* flags (%GENL_ADMIN_PERM or %GENL_UNS_ADMIN_PERM) * @maxattr: maximum number of attributes supported * @policy: netlink policy (takes precedence over family policy) * @validate: validation flags from enum genl_validate_flags * @doit: standard command callback * @start: start callback for dumps * @dumpit: callback for dumpers * @done: completion callback for dumps */ struct genl_ops { int (*doit)(struct sk_buff *skb, struct genl_info *info); int (*start)(struct netlink_callback *cb); int (*dumpit)(struct sk_buff *skb, struct netlink_callback *cb); int (*done)(struct netlink_callback *cb); const struct nla_policy *policy; unsigned int maxattr; u8 cmd; u8 internal_flags; u8 flags; u8 validate; }; /** * struct genl_info - info that is available during dumpit op call * @family: generic netlink family - for internal genl code usage * @ops: generic netlink ops - for internal genl code usage * @attrs: netlink attributes */ struct genl_dumpit_info { const struct genl_family *family; struct genl_ops op; struct nlattr **attrs; }; static inline const struct genl_dumpit_info * genl_dumpit_info(struct netlink_callback *cb) { return cb->data; } int genl_register_family(struct genl_family *family); int genl_unregister_family(const struct genl_family *family); void genl_notify(const struct genl_family *family, struct sk_buff *skb, struct genl_info *info, u32 group, gfp_t flags); void *genlmsg_put(struct sk_buff *skb, u32 portid, u32 seq, const struct genl_family *family, int flags, u8 cmd); /** * genlmsg_nlhdr - Obtain netlink header from user specified header * @user_hdr: user header as returned from genlmsg_put() * * Returns pointer to netlink header. 
*/ static inline struct nlmsghdr *genlmsg_nlhdr(void *user_hdr) { return (struct nlmsghdr *)((char *)user_hdr - GENL_HDRLEN - NLMSG_HDRLEN); } /** * genlmsg_parse_deprecated - parse attributes of a genetlink message * @nlh: netlink message header * @family: genetlink message family * @tb: destination array with maxtype+1 elements * @maxtype: maximum attribute type to be expected * @policy: validation policy * @extack: extended ACK report struct */ static inline int genlmsg_parse_deprecated(const struct nlmsghdr *nlh, const struct genl_family *family, struct nlattr *tb[], int maxtype, const struct nla_policy *policy, struct netlink_ext_ack *extack) { return __nlmsg_parse(nlh, family->hdrsize + GENL_HDRLEN, tb, maxtype, policy, NL_VALIDATE_LIBERAL, extack); } /** * genlmsg_parse - parse attributes of a genetlink message * @nlh: netlink message header * @family: genetlink message family * @tb: destination array with maxtype+1 elements * @maxtype: maximum attribute type to be expected * @policy: validation policy * @extack: extended ACK report struct */ static inline int genlmsg_parse(const struct nlmsghdr *nlh, const struct genl_family *family, struct nlattr *tb[], int maxtype, const struct nla_policy *policy, struct netlink_ext_ack *extack) { return __nlmsg_parse(nlh, family->hdrsize + GENL_HDRLEN, tb, maxtype, policy, NL_VALIDATE_STRICT, extack); } /** * genl_dump_check_consistent - check if sequence is consistent and advertise if not * @cb: netlink callback structure that stores the sequence number * @user_hdr: user header as returned from genlmsg_put() * * Cf. nl_dump_check_consistent(), this just provides a wrapper to make it * simpler to use with generic netlink. */ static inline void genl_dump_check_consistent(struct netlink_callback *cb, void *user_hdr) { nl_dump_check_consistent(cb, genlmsg_nlhdr(user_hdr)); } /** * genlmsg_put_reply - Add generic netlink header to a reply message * @skb: socket buffer holding the message * @info: receiver info * @family: generic netlink family * @flags: netlink message flags * @cmd: generic netlink command * * Returns pointer to user specific header */ static inline void *genlmsg_put_reply(struct sk_buff *skb, struct genl_info *info, const struct genl_family *family, int flags, u8 cmd) { return genlmsg_put(skb, info->snd_portid, info->snd_seq, family, flags, cmd); } /** * genlmsg_end - Finalize a generic netlink message * @skb: socket buffer the message is stored in * @hdr: user specific header */ static inline void genlmsg_end(struct sk_buff *skb, void *hdr) { nlmsg_end(skb, hdr - GENL_HDRLEN - NLMSG_HDRLEN); } /** * genlmsg_cancel - Cancel construction of a generic netlink message * @skb: socket buffer the message is stored in * @hdr: generic netlink message header */ static inline void genlmsg_cancel(struct sk_buff *skb, void *hdr) { if (hdr) nlmsg_cancel(skb, hdr - GENL_HDRLEN - NLMSG_HDRLEN); } /** * genlmsg_multicast_netns - multicast a netlink message to a specific netns * @family: the generic netlink family * @net: the net namespace * @skb: netlink message as socket buffer * @portid: own netlink portid to avoid sending to yourself * @group: offset of multicast group in groups array * @flags: allocation flags */ static inline int genlmsg_multicast_netns(const struct genl_family *family, struct net *net, struct sk_buff *skb, u32 portid, unsigned int group, gfp_t flags) { if (WARN_ON_ONCE(group >= family->n_mcgrps)) return -EINVAL; group = family->mcgrp_offset + group; return nlmsg_multicast(net->genl_sock, skb, portid, group, flags); } /** * 
genlmsg_multicast - multicast a netlink message to the default netns * @family: the generic netlink family * @skb: netlink message as socket buffer * @portid: own netlink portid to avoid sending to yourself * @group: offset of multicast group in groups array * @flags: allocation flags */ static inline int genlmsg_multicast(const struct genl_family *family, struct sk_buff *skb, u32 portid, unsigned int group, gfp_t flags) { return genlmsg_multicast_netns(family, &init_net, skb, portid, group, flags); } /** * genlmsg_multicast_allns - multicast a netlink message to all net namespaces * @family: the generic netlink family * @skb: netlink message as socket buffer * @portid: own netlink portid to avoid sending to yourself * @group: offset of multicast group in groups array * * This function must hold the RTNL or rcu_read_lock(). */ int genlmsg_multicast_allns(const struct genl_family *family, struct sk_buff *skb, u32 portid, unsigned int group); /** * genlmsg_unicast - unicast a netlink message * @skb: netlink message as socket buffer * @portid: netlink portid of the destination socket */ static inline int genlmsg_unicast(struct net *net, struct sk_buff *skb, u32 portid) { return nlmsg_unicast(net->genl_sock, skb, portid); } /** * genlmsg_reply - reply to a request * @skb: netlink message to be sent back * @info: receiver information */ static inline int genlmsg_reply(struct sk_buff *skb, struct genl_info *info) { return genlmsg_unicast(genl_info_net(info), skb, info->snd_portid); } /** * gennlmsg_data - head of message payload * @gnlh: genetlink message header */ static inline void *genlmsg_data(const struct genlmsghdr *gnlh) { return ((unsigned char *) gnlh + GENL_HDRLEN); } /** * genlmsg_len - length of message payload * @gnlh: genetlink message header */ static inline int genlmsg_len(const struct genlmsghdr *gnlh) { struct nlmsghdr *nlh = (struct nlmsghdr *)((unsigned char *)gnlh - NLMSG_HDRLEN); return (nlh->nlmsg_len - GENL_HDRLEN - NLMSG_HDRLEN); } /** * genlmsg_msg_size - length of genetlink message not including padding * @payload: length of message payload */ static inline int genlmsg_msg_size(int payload) { return GENL_HDRLEN + payload; } /** * genlmsg_total_size - length of genetlink message including padding * @payload: length of message payload */ static inline int genlmsg_total_size(int payload) { return NLMSG_ALIGN(genlmsg_msg_size(payload)); } /** * genlmsg_new - Allocate a new generic netlink message * @payload: size of the message payload * @flags: the type of memory to allocate. */ static inline struct sk_buff *genlmsg_new(size_t payload, gfp_t flags) { return nlmsg_new(genlmsg_total_size(payload), flags); } /** * genl_set_err - report error to genetlink broadcast listeners * @family: the generic netlink family * @net: the network namespace to report the error to * @portid: the PORTID of a process that we want to skip (if any) * @group: the broadcast group that will notice the error * (this is the offset of the multicast group in the groups array) * @code: error code, must be negative (as usual in kernelspace) * * This function returns the number of broadcast listeners that have set the * NETLINK_RECV_NO_ENOBUFS socket option. 
*/ static inline int genl_set_err(const struct genl_family *family, struct net *net, u32 portid, u32 group, int code) { if (WARN_ON_ONCE(group >= family->n_mcgrps)) return -EINVAL; group = family->mcgrp_offset + group; return netlink_set_err(net->genl_sock, portid, group, code); } static inline int genl_has_listeners(const struct genl_family *family, struct net *net, unsigned int group) { if (WARN_ON_ONCE(group >= family->n_mcgrps)) return -EINVAL; group = family->mcgrp_offset + group; return netlink_has_listeners(net->genl_sock, group); } #endif /* __NET_GENERIC_NETLINK_H */
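/*
 * Illustrative sketch (not part of this header): a hypothetical "examplenl"
 * family showing how the structures above fit together. The command and
 * attribute numbers and all example_* names are made up for this sketch;
 * genl_register_family(), genlmsg_new(), genlmsg_put_reply(), genlmsg_end(),
 * genlmsg_cancel() and genlmsg_reply() are the interfaces declared above.
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <net/genetlink.h>

enum { EXAMPLE_ATTR_UNSPEC, EXAMPLE_ATTR_MSG, __EXAMPLE_ATTR_MAX };
#define EXAMPLE_ATTR_MAX (__EXAMPLE_ATTR_MAX - 1)
enum { EXAMPLE_CMD_UNSPEC, EXAMPLE_CMD_ECHO };

static const struct nla_policy example_policy[EXAMPLE_ATTR_MAX + 1] = {
	[EXAMPLE_ATTR_MSG] = { .type = NLA_NUL_STRING },
};

static struct genl_family example_family;

/* doit handler: echo EXAMPLE_ATTR_MSG back to the sender. */
static int example_echo_doit(struct sk_buff *skb, struct genl_info *info)
{
	struct sk_buff *msg;
	void *hdr;

	if (!info->attrs[EXAMPLE_ATTR_MSG])
		return -EINVAL;

	msg = genlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
	if (!msg)
		return -ENOMEM;

	hdr = genlmsg_put_reply(msg, info, &example_family, 0, EXAMPLE_CMD_ECHO);
	if (!hdr)
		goto err_free;
	if (nla_put_string(msg, EXAMPLE_ATTR_MSG,
			   nla_data(info->attrs[EXAMPLE_ATTR_MSG])))
		goto err_cancel;

	genlmsg_end(msg, hdr);
	return genlmsg_reply(msg, info);	/* unicast back to snd_portid */

err_cancel:
	genlmsg_cancel(msg, hdr);
err_free:
	nlmsg_free(msg);
	return -EMSGSIZE;
}

static const struct genl_ops example_ops[] = {
	{
		.cmd	= EXAMPLE_CMD_ECHO,
		.doit	= example_echo_doit,
	},
};

static struct genl_family example_family = {
	.name		= "examplenl",
	.version	= 1,
	.maxattr	= EXAMPLE_ATTR_MAX,
	.policy		= example_policy,
	.module		= THIS_MODULE,
	.ops		= example_ops,
	.n_ops		= ARRAY_SIZE(example_ops),
};

/* Registration, e.g. from module init: genl_register_family(&example_family); */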
// SPDX-License-Identifier: GPL-2.0 /* * Block rq-qos base io controller * * This works similarly to wbt with a few exceptions * * - It's bio based, so the latency covers the whole block layer in addition to * the actual io. * - We will throttle all IO that comes in here if we need to. * - We use the mean latency over the 100ms window. This is because writes can * be particularly fast, which could give us a false sense of the impact of * other workloads on our protected workload. * - By default there's no throttling, we set the queue_depth to UINT_MAX so * that we can have as many outstanding bio's as we're allowed to. Only at * throttle time do we pay attention to the actual queue depth. * * The hierarchy works like the cpu controller does, we track the latency at * every configured node, and each configured node has its own independent * queue depth. This means that we only care about our latency targets at the * peer level. Some group at the bottom of the hierarchy isn't going to affect * a group at the end of some other path if we're only configured at leaf level. * * Consider the following * * root blkg * / \ * fast (target=5ms) slow (target=10ms) * / \ / \ * a b normal(15ms) unloved * * "a" and "b" have no target, but their combined io under "fast" cannot exceed * an average latency of 5ms. If it does then we will throttle the "slow" * group. In the case of "normal", if it exceeds its 15ms target, we will * throttle "unloved", but nobody else. * * In this example "fast", "slow", and "normal" will be the only groups actually * accounting their io latencies. We have to walk up the hierarchy to the root * on every submit and complete so we can do the appropriate stat recording and * adjust the queue depth of ourselves if needed. * * There are 2 ways we throttle IO. * * 1) Queue depth throttling. As we throttle down we will adjust the maximum * number of IO's we're allowed to have in flight. This starts at (u64)-1 down * to 1. If the group is only ever submitting IO for itself then this is the * only way we throttle. * * 2) Induced delay throttling. This is for the case that a group is generating * IO that has to be issued by the root cg to avoid priority inversion. So think * REQ_META or REQ_SWAP. If we are already at qd == 1 and we're getting a lot * of work done for us on behalf of the root cg and are being asked to scale * down more, then we induce a latency at userspace return.
We accumulate the * total amount of time we need to be punished by doing * * total_time += min_lat_nsec - actual_io_completion * * and then at throttle time will do * * throttle_time = min(total_time, NSEC_PER_SEC) * * This induced delay will throttle back the activity that is generating the * root cg issued io's, wethere that's some metadata intensive operation or the * group is using so much memory that it is pushing us into swap. * * Copyright (C) 2018 Josef Bacik */ #include <linux/kernel.h> #include <linux/blk_types.h> #include <linux/backing-dev.h> #include <linux/module.h> #include <linux/timer.h> #include <linux/memcontrol.h> #include <linux/sched/loadavg.h> #include <linux/sched/signal.h> #include <trace/events/block.h> #include <linux/blk-mq.h> #include "blk-rq-qos.h" #include "blk-stat.h" #include "blk.h" #define DEFAULT_SCALE_COOKIE 1000000U static struct blkcg_policy blkcg_policy_iolatency; struct iolatency_grp; struct blk_iolatency { struct rq_qos rqos; struct timer_list timer; /* * ->enabled is the master enable switch gating the throttling logic and * inflight tracking. The number of cgroups which have iolat enabled is * tracked in ->enable_cnt, and ->enable is flipped on/off accordingly * from ->enable_work with the request_queue frozen. For details, See * blkiolatency_enable_work_fn(). */ bool enabled; atomic_t enable_cnt; struct work_struct enable_work; }; static inline struct blk_iolatency *BLKIOLATENCY(struct rq_qos *rqos) { return container_of(rqos, struct blk_iolatency, rqos); } struct child_latency_info { spinlock_t lock; /* Last time we adjusted the scale of everybody. */ u64 last_scale_event; /* The latency that we missed. */ u64 scale_lat; /* Total io's from all of our children for the last summation. */ u64 nr_samples; /* The guy who actually changed the latency numbers. */ struct iolatency_grp *scale_grp; /* Cookie to tell if we need to scale up or down. */ atomic_t scale_cookie; }; struct percentile_stats { u64 total; u64 missed; }; struct latency_stat { union { struct percentile_stats ps; struct blk_rq_stat rqs; }; }; struct iolatency_grp { struct blkg_policy_data pd; struct latency_stat __percpu *stats; struct latency_stat cur_stat; struct blk_iolatency *blkiolat; struct rq_depth rq_depth; struct rq_wait rq_wait; atomic64_t window_start; atomic_t scale_cookie; u64 min_lat_nsec; u64 cur_win_nsec; /* total running average of our io latency. */ u64 lat_avg; /* Our current number of IO's for the last summation. */ u64 nr_samples; bool ssd; struct child_latency_info child_lat; }; #define BLKIOLATENCY_MIN_WIN_SIZE (100 * NSEC_PER_MSEC) #define BLKIOLATENCY_MAX_WIN_SIZE NSEC_PER_SEC /* * These are the constants used to fake the fixed-point moving average * calculation just like load average. The call to calc_load() folds * (FIXED_1 (2048) - exp_factor) * new_sample into lat_avg. The sampling * window size is bucketed to try to approximately calculate average * latency such that 1/exp (decay rate) is [1 min, 2.5 min) when windows * elapse immediately. Note, windows only elapse with IO activity. Idle * periods extend the most recent window. 
*/ #define BLKIOLATENCY_NR_EXP_FACTORS 5 #define BLKIOLATENCY_EXP_BUCKET_SIZE (BLKIOLATENCY_MAX_WIN_SIZE / \ (BLKIOLATENCY_NR_EXP_FACTORS - 1)) static const u64 iolatency_exp_factors[BLKIOLATENCY_NR_EXP_FACTORS] = { 2045, // exp(1/600) - 600 samples 2039, // exp(1/240) - 240 samples 2031, // exp(1/120) - 120 samples 2023, // exp(1/80) - 80 samples 2014, // exp(1/60) - 60 samples }; static inline struct iolatency_grp *pd_to_lat(struct blkg_policy_data *pd) { return pd ? container_of(pd, struct iolatency_grp, pd) : NULL; } static inline struct iolatency_grp *blkg_to_lat(struct blkcg_gq *blkg) { return pd_to_lat(blkg_to_pd(blkg, &blkcg_policy_iolatency)); } static inline struct blkcg_gq *lat_to_blkg(struct iolatency_grp *iolat) { return pd_to_blkg(&iolat->pd); } static inline void latency_stat_init(struct iolatency_grp *iolat, struct latency_stat *stat) { if (iolat->ssd) { stat->ps.total = 0; stat->ps.missed = 0; } else blk_rq_stat_init(&stat->rqs); } static inline void latency_stat_sum(struct iolatency_grp *iolat, struct latency_stat *sum, struct latency_stat *stat) { if (iolat->ssd) { sum->ps.total += stat->ps.total; sum->ps.missed += stat->ps.missed; } else blk_rq_stat_sum(&sum->rqs, &stat->rqs); } static inline void latency_stat_record_time(struct iolatency_grp *iolat, u64 req_time) { struct latency_stat *stat = get_cpu_ptr(iolat->stats); if (iolat->ssd) { if (req_time >= iolat->min_lat_nsec) stat->ps.missed++; stat->ps.total++; } else blk_rq_stat_add(&stat->rqs, req_time); put_cpu_ptr(stat); } static inline bool latency_sum_ok(struct iolatency_grp *iolat, struct latency_stat *stat) { if (iolat->ssd) { u64 thresh = div64_u64(stat->ps.total, 10); thresh = max(thresh, 1ULL); return stat->ps.missed < thresh; } return stat->rqs.mean <= iolat->min_lat_nsec; } static inline u64 latency_stat_samples(struct iolatency_grp *iolat, struct latency_stat *stat) { if (iolat->ssd) return stat->ps.total; return stat->rqs.nr_samples; } static inline void iolat_update_total_lat_avg(struct iolatency_grp *iolat, struct latency_stat *stat) { int exp_idx; if (iolat->ssd) return; /* * calc_load() takes in a number stored in fixed point representation. * Because we are using this for IO time in ns, the values stored * are significantly larger than the FIXED_1 denominator (2048). * Therefore, rounding errors in the calculation are negligible and * can be ignored. */ exp_idx = min_t(int, BLKIOLATENCY_NR_EXP_FACTORS - 1, div64_u64(iolat->cur_win_nsec, BLKIOLATENCY_EXP_BUCKET_SIZE)); iolat->lat_avg = calc_load(iolat->lat_avg, iolatency_exp_factors[exp_idx], stat->rqs.mean); } static void iolat_cleanup_cb(struct rq_wait *rqw, void *private_data) { atomic_dec(&rqw->inflight); wake_up(&rqw->wait); } static bool iolat_acquire_inflight(struct rq_wait *rqw, void *private_data) { struct iolatency_grp *iolat = private_data; return rq_wait_inc_below(rqw, iolat->rq_depth.max_depth); } static void __blkcg_iolatency_throttle(struct rq_qos *rqos, struct iolatency_grp *iolat, bool issue_as_root, bool use_memdelay) { struct rq_wait *rqw = &iolat->rq_wait; unsigned use_delay = atomic_read(&lat_to_blkg(iolat)->use_delay); if (use_delay) blkcg_schedule_throttle(rqos->q, use_memdelay); /* * To avoid priority inversions we want to just take a slot if we are * issuing as root. If we're being killed off there's no point in * delaying things, we may have been killed by OOM so throttling may * make recovery take even longer, so just let the IO's through so the * task can go away. 
*/ if (issue_as_root || fatal_signal_pending(current)) { atomic_inc(&rqw->inflight); return; } rq_qos_wait(rqw, iolat, iolat_acquire_inflight, iolat_cleanup_cb); } #define SCALE_DOWN_FACTOR 2 #define SCALE_UP_FACTOR 4 static inline unsigned long scale_amount(unsigned long qd, bool up) { return max(up ? qd >> SCALE_UP_FACTOR : qd >> SCALE_DOWN_FACTOR, 1UL); } /* * We scale the qd down faster than we scale up, so we need to use this helper * to adjust the scale_cookie accordingly so we don't prematurely get * scale_cookie at DEFAULT_SCALE_COOKIE and unthrottle too much. * * Each group has their own local copy of the last scale cookie they saw, so if * the global scale cookie goes up or down they know which way they need to go * based on their last knowledge of it. */ static void scale_cookie_change(struct blk_iolatency *blkiolat, struct child_latency_info *lat_info, bool up) { unsigned long qd = blkiolat->rqos.q->nr_requests; unsigned long scale = scale_amount(qd, up); unsigned long old = atomic_read(&lat_info->scale_cookie); unsigned long max_scale = qd << 1; unsigned long diff = 0; if (old < DEFAULT_SCALE_COOKIE) diff = DEFAULT_SCALE_COOKIE - old; if (up) { if (scale + old > DEFAULT_SCALE_COOKIE) atomic_set(&lat_info->scale_cookie, DEFAULT_SCALE_COOKIE); else if (diff > qd) atomic_inc(&lat_info->scale_cookie); else atomic_add(scale, &lat_info->scale_cookie); } else { /* * We don't want to dig a hole so deep that it takes us hours to * dig out of it. Just enough that we don't throttle/unthrottle * with jagged workloads but can still unthrottle once pressure * has sufficiently dissipated. */ if (diff > qd) { if (diff < max_scale) atomic_dec(&lat_info->scale_cookie); } else { atomic_sub(scale, &lat_info->scale_cookie); } } } /* * Change the queue depth of the iolatency_grp. We add/subtract 1/16th of the * queue depth at a time so we don't get wild swings and hopefully dial in to * fairer distribution of the overall queue depth. */ static void scale_change(struct iolatency_grp *iolat, bool up) { unsigned long qd = iolat->blkiolat->rqos.q->nr_requests; unsigned long scale = scale_amount(qd, up); unsigned long old = iolat->rq_depth.max_depth; if (old > qd) old = qd; if (up) { if (old == 1 && blkcg_unuse_delay(lat_to_blkg(iolat))) return; if (old < qd) { old += scale; old = min(old, qd); iolat->rq_depth.max_depth = old; wake_up_all(&iolat->rq_wait.wait); } } else { old >>= 1; iolat->rq_depth.max_depth = max(old, 1UL); } } /* Check our parent and see if the scale cookie has changed. */ static void check_scale_change(struct iolatency_grp *iolat) { struct iolatency_grp *parent; struct child_latency_info *lat_info; unsigned int cur_cookie; unsigned int our_cookie = atomic_read(&iolat->scale_cookie); u64 scale_lat; unsigned int old; int direction = 0; if (lat_to_blkg(iolat)->parent == NULL) return; parent = blkg_to_lat(lat_to_blkg(iolat)->parent); if (!parent) return; lat_info = &parent->child_lat; cur_cookie = atomic_read(&lat_info->scale_cookie); scale_lat = READ_ONCE(lat_info->scale_lat); if (cur_cookie < our_cookie) direction = -1; else if (cur_cookie > our_cookie) direction = 1; else return; old = atomic_cmpxchg(&iolat->scale_cookie, our_cookie, cur_cookie); /* Somebody beat us to the punch, just bail. 
*/ if (old != our_cookie) return; if (direction < 0 && iolat->min_lat_nsec) { u64 samples_thresh; if (!scale_lat || iolat->min_lat_nsec <= scale_lat) return; /* * Sometimes high priority groups are their own worst enemy, so * instead of taking it out on some poor other group that did 5% * or less of the IO's for the last summation just skip this * scale down event. */ samples_thresh = lat_info->nr_samples * 5; samples_thresh = max(1ULL, div64_u64(samples_thresh, 100)); if (iolat->nr_samples <= samples_thresh) return; } /* We're as low as we can go. */ if (iolat->rq_depth.max_depth == 1 && direction < 0) { blkcg_use_delay(lat_to_blkg(iolat)); return; } /* We're back to the default cookie, unthrottle all the things. */ if (cur_cookie == DEFAULT_SCALE_COOKIE) { blkcg_clear_delay(lat_to_blkg(iolat)); iolat->rq_depth.max_depth = UINT_MAX; wake_up_all(&iolat->rq_wait.wait); return; } scale_change(iolat, direction > 0); } static void blkcg_iolatency_throttle(struct rq_qos *rqos, struct bio *bio) { struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos); struct blkcg_gq *blkg = bio->bi_blkg; bool issue_as_root = bio_issue_as_root_blkg(bio); if (!blkiolat->enabled) return; while (blkg && blkg->parent) { struct iolatency_grp *iolat = blkg_to_lat(blkg); if (!iolat) { blkg = blkg->parent; continue; } check_scale_change(iolat); __blkcg_iolatency_throttle(rqos, iolat, issue_as_root, (bio->bi_opf & REQ_SWAP) == REQ_SWAP); blkg = blkg->parent; } if (!timer_pending(&blkiolat->timer)) mod_timer(&blkiolat->timer, jiffies + HZ); } static void iolatency_record_time(struct iolatency_grp *iolat, struct bio_issue *issue, u64 now, bool issue_as_root) { u64 start = bio_issue_time(issue); u64 req_time; /* * Have to do this so we are truncated to the correct time that our * issue is truncated to. */ now = __bio_issue_time(now); if (now <= start) return; req_time = now - start; /* * We don't want to count issue_as_root bio's in the cgroups latency * statistics as it could skew the numbers downwards. */ if (unlikely(issue_as_root && iolat->rq_depth.max_depth != UINT_MAX)) { u64 sub = iolat->min_lat_nsec; if (req_time < sub) blkcg_add_delay(lat_to_blkg(iolat), now, sub - req_time); return; } latency_stat_record_time(iolat, req_time); } #define BLKIOLATENCY_MIN_ADJUST_TIME (500 * NSEC_PER_MSEC) #define BLKIOLATENCY_MIN_GOOD_SAMPLES 5 static void iolatency_check_latencies(struct iolatency_grp *iolat, u64 now) { struct blkcg_gq *blkg = lat_to_blkg(iolat); struct iolatency_grp *parent; struct child_latency_info *lat_info; struct latency_stat stat; unsigned long flags; int cpu; latency_stat_init(iolat, &stat); preempt_disable(); for_each_online_cpu(cpu) { struct latency_stat *s; s = per_cpu_ptr(iolat->stats, cpu); latency_stat_sum(iolat, &stat, s); latency_stat_init(iolat, s); } preempt_enable(); parent = blkg_to_lat(blkg->parent); if (!parent) return; lat_info = &parent->child_lat; iolat_update_total_lat_avg(iolat, &stat); /* Everything is ok and we don't need to adjust the scale. */ if (latency_sum_ok(iolat, &stat) && atomic_read(&lat_info->scale_cookie) == DEFAULT_SCALE_COOKIE) return; /* Somebody beat us to the punch, just bail. 
*/ spin_lock_irqsave(&lat_info->lock, flags); latency_stat_sum(iolat, &iolat->cur_stat, &stat); lat_info->nr_samples -= iolat->nr_samples; lat_info->nr_samples += latency_stat_samples(iolat, &iolat->cur_stat); iolat->nr_samples = latency_stat_samples(iolat, &iolat->cur_stat); if ((lat_info->last_scale_event >= now || now - lat_info->last_scale_event < BLKIOLATENCY_MIN_ADJUST_TIME)) goto out; if (latency_sum_ok(iolat, &iolat->cur_stat) && latency_sum_ok(iolat, &stat)) { if (latency_stat_samples(iolat, &iolat->cur_stat) < BLKIOLATENCY_MIN_GOOD_SAMPLES) goto out; if (lat_info->scale_grp == iolat) { lat_info->last_scale_event = now; scale_cookie_change(iolat->blkiolat, lat_info, true); } } else if (lat_info->scale_lat == 0 || lat_info->scale_lat >= iolat->min_lat_nsec) { lat_info->last_scale_event = now; if (!lat_info->scale_grp || lat_info->scale_lat > iolat->min_lat_nsec) { WRITE_ONCE(lat_info->scale_lat, iolat->min_lat_nsec); lat_info->scale_grp = iolat; } scale_cookie_change(iolat->blkiolat, lat_info, false); } latency_stat_init(iolat, &iolat->cur_stat); out: spin_unlock_irqrestore(&lat_info->lock, flags); } static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio) { struct blkcg_gq *blkg; struct rq_wait *rqw; struct iolatency_grp *iolat; u64 window_start; u64 now; bool issue_as_root = bio_issue_as_root_blkg(bio); int inflight = 0; blkg = bio->bi_blkg; if (!blkg || !bio_flagged(bio, BIO_QOS_THROTTLED)) return; iolat = blkg_to_lat(bio->bi_blkg); if (!iolat) return; if (!iolat->blkiolat->enabled) return; now = ktime_to_ns(ktime_get()); while (blkg && blkg->parent) { iolat = blkg_to_lat(blkg); if (!iolat) { blkg = blkg->parent; continue; } rqw = &iolat->rq_wait; inflight = atomic_dec_return(&rqw->inflight); WARN_ON_ONCE(inflight < 0); /* * If bi_status is BLK_STS_AGAIN, the bio wasn't actually * submitted, so do not account for it. */ if (iolat->min_lat_nsec && bio->bi_status != BLK_STS_AGAIN) { iolatency_record_time(iolat, &bio->bi_issue, now, issue_as_root); window_start = atomic64_read(&iolat->window_start); if (now > window_start && (now - window_start) >= iolat->cur_win_nsec) { if (atomic64_cmpxchg(&iolat->window_start, window_start, now) == window_start) iolatency_check_latencies(iolat, now); } } wake_up(&rqw->wait); blkg = blkg->parent; } } static void blkcg_iolatency_exit(struct rq_qos *rqos) { struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos); del_timer_sync(&blkiolat->timer); flush_work(&blkiolat->enable_work); blkcg_deactivate_policy(rqos->q, &blkcg_policy_iolatency); kfree(blkiolat); } static struct rq_qos_ops blkcg_iolatency_ops = { .throttle = blkcg_iolatency_throttle, .done_bio = blkcg_iolatency_done_bio, .exit = blkcg_iolatency_exit, }; static void blkiolatency_timer_fn(struct timer_list *t) { struct blk_iolatency *blkiolat = from_timer(blkiolat, t, timer); struct blkcg_gq *blkg; struct cgroup_subsys_state *pos_css; u64 now = ktime_to_ns(ktime_get()); rcu_read_lock(); blkg_for_each_descendant_pre(blkg, pos_css, blkiolat->rqos.q->root_blkg) { struct iolatency_grp *iolat; struct child_latency_info *lat_info; unsigned long flags; u64 cookie; /* * We could be exiting, don't access the pd unless we have a * ref on the blkg. 
*/ if (!blkg_tryget(blkg)) continue; iolat = blkg_to_lat(blkg); if (!iolat) goto next; lat_info = &iolat->child_lat; cookie = atomic_read(&lat_info->scale_cookie); if (cookie >= DEFAULT_SCALE_COOKIE) goto next; spin_lock_irqsave(&lat_info->lock, flags); if (lat_info->last_scale_event >= now) goto next_lock; /* * We scaled down but don't have a scale_grp, scale up and carry * on. */ if (lat_info->scale_grp == NULL) { scale_cookie_change(iolat->blkiolat, lat_info, true); goto next_lock; } /* * It's been 5 seconds since our last scale event, clear the * scale grp in case the group that needed the scale down isn't * doing any IO currently. */ if (now - lat_info->last_scale_event >= ((u64)NSEC_PER_SEC * 5)) lat_info->scale_grp = NULL; next_lock: spin_unlock_irqrestore(&lat_info->lock, flags); next: blkg_put(blkg); } rcu_read_unlock(); } /** * blkiolatency_enable_work_fn - Enable or disable iolatency on the device * @work: enable_work of the blk_iolatency of interest * * iolatency needs to keep track of the number of in-flight IOs per cgroup. This * is relatively expensive as it involves walking up the hierarchy twice for * every IO. Thus, if iolatency is not enabled in any cgroup for the device, we * want to disable the in-flight tracking. * * We have to make sure that the counting is balanced - we don't want to leak * the in-flight counts by disabling accounting in the completion path while IOs * are in flight. This is achieved by ensuring that no IO is in flight by * freezing the queue while flipping ->enabled. As this requires a sleepable * context, ->enabled flipping is punted to this work function. */ static void blkiolatency_enable_work_fn(struct work_struct *work) { struct blk_iolatency *blkiolat = container_of(work, struct blk_iolatency, enable_work); bool enabled; /* * There can only be one instance of this function running for @blkiolat * and it's guaranteed to be executed at least once after the latest * ->enabled_cnt modification. Acting on the latest ->enable_cnt is * sufficient. * * Also, we know @blkiolat is safe to access as ->enable_work is flushed * in blkcg_iolatency_exit(). 
*/ enabled = atomic_read(&blkiolat->enable_cnt); if (enabled != blkiolat->enabled) { blk_mq_freeze_queue(blkiolat->rqos.q); blkiolat->enabled = enabled; blk_mq_unfreeze_queue(blkiolat->rqos.q); } } int blk_iolatency_init(struct request_queue *q) { struct blk_iolatency *blkiolat; struct rq_qos *rqos; int ret; blkiolat = kzalloc(sizeof(*blkiolat), GFP_KERNEL); if (!blkiolat) return -ENOMEM; rqos = &blkiolat->rqos; rqos->id = RQ_QOS_LATENCY; rqos->ops = &blkcg_iolatency_ops; rqos->q = q; ret = rq_qos_add(q, rqos); if (ret) goto err_free; ret = blkcg_activate_policy(q, &blkcg_policy_iolatency); if (ret) goto err_qos_del; timer_setup(&blkiolat->timer, blkiolatency_timer_fn, 0); INIT_WORK(&blkiolat->enable_work, blkiolatency_enable_work_fn); return 0; err_qos_del: rq_qos_del(q, rqos); err_free: kfree(blkiolat); return ret; } static void iolatency_set_min_lat_nsec(struct blkcg_gq *blkg, u64 val) { struct iolatency_grp *iolat = blkg_to_lat(blkg); struct blk_iolatency *blkiolat = iolat->blkiolat; u64 oldval = iolat->min_lat_nsec; iolat->min_lat_nsec = val; iolat->cur_win_nsec = max_t(u64, val << 4, BLKIOLATENCY_MIN_WIN_SIZE); iolat->cur_win_nsec = min_t(u64, iolat->cur_win_nsec, BLKIOLATENCY_MAX_WIN_SIZE); if (!oldval && val) { if (atomic_inc_return(&blkiolat->enable_cnt) == 1) schedule_work(&blkiolat->enable_work); } if (oldval && !val) { blkcg_clear_delay(blkg); if (atomic_dec_return(&blkiolat->enable_cnt) == 0) schedule_work(&blkiolat->enable_work); } } static void iolatency_clear_scaling(struct blkcg_gq *blkg) { if (blkg->parent) { struct iolatency_grp *iolat = blkg_to_lat(blkg->parent); struct child_latency_info *lat_info; if (!iolat) return; lat_info = &iolat->child_lat; spin_lock(&lat_info->lock); atomic_set(&lat_info->scale_cookie, DEFAULT_SCALE_COOKIE); lat_info->last_scale_event = 0; lat_info->scale_grp = NULL; lat_info->scale_lat = 0; spin_unlock(&lat_info->lock); } } static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { struct blkcg *blkcg = css_to_blkcg(of_css(of)); struct blkcg_gq *blkg; struct blkg_conf_ctx ctx; struct iolatency_grp *iolat; char *p, *tok; u64 lat_val = 0; u64 oldval; int ret; ret = blkg_conf_prep(blkcg, &blkcg_policy_iolatency, buf, &ctx); if (ret) return ret; iolat = blkg_to_lat(ctx.blkg); p = ctx.body; ret = -EINVAL; while ((tok = strsep(&p, " "))) { char key[16]; char val[21]; /* 18446744073709551616 */ if (sscanf(tok, "%15[^=]=%20s", key, val) != 2) goto out; if (!strcmp(key, "target")) { u64 v; if (!strcmp(val, "max")) lat_val = 0; else if (sscanf(val, "%llu", &v) == 1) lat_val = v * NSEC_PER_USEC; else goto out; } else { goto out; } } /* Walk up the tree to see if our new val is lower than it should be. 
*/ blkg = ctx.blkg; oldval = iolat->min_lat_nsec; iolatency_set_min_lat_nsec(blkg, lat_val); if (oldval != iolat->min_lat_nsec) iolatency_clear_scaling(blkg); ret = 0; out: blkg_conf_finish(&ctx); return ret ?: nbytes; } static u64 iolatency_prfill_limit(struct seq_file *sf, struct blkg_policy_data *pd, int off) { struct iolatency_grp *iolat = pd_to_lat(pd); const char *dname = blkg_dev_name(pd->blkg); if (!dname || !iolat->min_lat_nsec) return 0; seq_printf(sf, "%s target=%llu\n", dname, div_u64(iolat->min_lat_nsec, NSEC_PER_USEC)); return 0; } static int iolatency_print_limit(struct seq_file *sf, void *v) { blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), iolatency_prfill_limit, &blkcg_policy_iolatency, seq_cft(sf)->private, false); return 0; } static bool iolatency_ssd_stat(struct iolatency_grp *iolat, struct seq_file *s) { struct latency_stat stat; int cpu; latency_stat_init(iolat, &stat); preempt_disable(); for_each_online_cpu(cpu) { struct latency_stat *s; s = per_cpu_ptr(iolat->stats, cpu); latency_stat_sum(iolat, &stat, s); } preempt_enable(); if (iolat->rq_depth.max_depth == UINT_MAX) seq_printf(s, " missed=%llu total=%llu depth=max", (unsigned long long)stat.ps.missed, (unsigned long long)stat.ps.total); else seq_printf(s, " missed=%llu total=%llu depth=%u", (unsigned long long)stat.ps.missed, (unsigned long long)stat.ps.total, iolat->rq_depth.max_depth); return true; } static bool iolatency_pd_stat(struct blkg_policy_data *pd, struct seq_file *s) { struct iolatency_grp *iolat = pd_to_lat(pd); unsigned long long avg_lat; unsigned long long cur_win; if (!blkcg_debug_stats) return false; if (iolat->ssd) return iolatency_ssd_stat(iolat, s); avg_lat = div64_u64(iolat->lat_avg, NSEC_PER_USEC); cur_win = div64_u64(iolat->cur_win_nsec, NSEC_PER_MSEC); if (iolat->rq_depth.max_depth == UINT_MAX) seq_printf(s, " depth=max avg_lat=%llu win=%llu", avg_lat, cur_win); else seq_printf(s, " depth=%u avg_lat=%llu win=%llu", iolat->rq_depth.max_depth, avg_lat, cur_win); return true; } static struct blkg_policy_data *iolatency_pd_alloc(gfp_t gfp, struct request_queue *q, struct blkcg *blkcg) { struct iolatency_grp *iolat; iolat = kzalloc_node(sizeof(*iolat), gfp, q->node); if (!iolat) return NULL; iolat->stats = __alloc_percpu_gfp(sizeof(struct latency_stat), __alignof__(struct latency_stat), gfp); if (!iolat->stats) { kfree(iolat); return NULL; } return &iolat->pd; } static void iolatency_pd_init(struct blkg_policy_data *pd) { struct iolatency_grp *iolat = pd_to_lat(pd); struct blkcg_gq *blkg = lat_to_blkg(iolat); struct rq_qos *rqos = blkcg_rq_qos(blkg->q); struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos); u64 now = ktime_to_ns(ktime_get()); int cpu; if (blk_queue_nonrot(blkg->q)) iolat->ssd = true; else iolat->ssd = false; for_each_possible_cpu(cpu) { struct latency_stat *stat; stat = per_cpu_ptr(iolat->stats, cpu); latency_stat_init(iolat, stat); } latency_stat_init(iolat, &iolat->cur_stat); rq_wait_init(&iolat->rq_wait); spin_lock_init(&iolat->child_lat.lock); iolat->rq_depth.queue_depth = blkg->q->nr_requests; iolat->rq_depth.max_depth = UINT_MAX; iolat->rq_depth.default_depth = iolat->rq_depth.queue_depth; iolat->blkiolat = blkiolat; iolat->cur_win_nsec = 100 * NSEC_PER_MSEC; atomic64_set(&iolat->window_start, now); /* * We init things in list order, so the pd for the parent may not be * init'ed yet for whatever reason. 
*/ if (blkg->parent && blkg_to_pd(blkg->parent, &blkcg_policy_iolatency)) { struct iolatency_grp *parent = blkg_to_lat(blkg->parent); atomic_set(&iolat->scale_cookie, atomic_read(&parent->child_lat.scale_cookie)); } else { atomic_set(&iolat->scale_cookie, DEFAULT_SCALE_COOKIE); } atomic_set(&iolat->child_lat.scale_cookie, DEFAULT_SCALE_COOKIE); } static void iolatency_pd_offline(struct blkg_policy_data *pd) { struct iolatency_grp *iolat = pd_to_lat(pd); struct blkcg_gq *blkg = lat_to_blkg(iolat); iolatency_set_min_lat_nsec(blkg, 0); iolatency_clear_scaling(blkg); } static void iolatency_pd_free(struct blkg_policy_data *pd) { struct iolatency_grp *iolat = pd_to_lat(pd); free_percpu(iolat->stats); kfree(iolat); } static struct cftype iolatency_files[] = { { .name = "latency", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = iolatency_print_limit, .write = iolatency_set_limit, }, {} }; static struct blkcg_policy blkcg_policy_iolatency = { .dfl_cftypes = iolatency_files, .pd_alloc_fn = iolatency_pd_alloc, .pd_init_fn = iolatency_pd_init, .pd_offline_fn = iolatency_pd_offline, .pd_free_fn = iolatency_pd_free, .pd_stat_fn = iolatency_pd_stat, }; static int __init iolatency_init(void) { return blkcg_policy_register(&blkcg_policy_iolatency); } static void __exit iolatency_exit(void) { blkcg_policy_unregister(&blkcg_policy_iolatency); } module_init(iolatency_init); module_exit(iolatency_exit);
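To make the queue-depth arithmetic above easier to follow, here is a minimal userspace sketch of the scale_amount()/scale_change() behaviour: the allowed depth is halved on every scale-down event but only grows by qd/16 per scale-up event, so recovery from throttling is deliberately gradual. The 128-request queue depth and everything in main() are illustrative assumptions, not part of the kernel code.

/*
 * Minimal userspace sketch of the scale_amount()/scale_change() arithmetic.
 * The queue depth of 128 is an assumed example value.
 */
#include <stdio.h>

#define SCALE_DOWN_FACTOR 2
#define SCALE_UP_FACTOR   4

static unsigned long scale_amount(unsigned long qd, int up)
{
	unsigned long step = up ? qd >> SCALE_UP_FACTOR : qd >> SCALE_DOWN_FACTOR;

	return step ? step : 1;	/* never step by less than one request */
}

int main(void)
{
	unsigned long qd = 128, depth = 128;
	int i;

	/* scale_change(iolat, false): the allowed depth is halved per event. */
	for (i = 0; i < 4; i++) {
		depth = depth > 1 ? depth >> 1 : 1;
		printf("scale down %d: depth=%lu\n", i, depth);
	}

	/* scale_change(iolat, true): depth grows by qd/16 per event, capped at qd. */
	for (i = 0; i < 4; i++) {
		depth += scale_amount(qd, 1);
		if (depth > qd)
			depth = qd;
		printf("scale up   %d: depth=%lu\n", i, depth);
	}
	return 0;
}

Run for a few iterations this shows why a group that was throttled down to depth 1 needs many windows to climb back to full depth, which is the "don't dig a hole so deep it takes hours to dig out of" behaviour described in the comments above.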
// SPDX-License-Identifier: GPL-2.0-only /* * This is the 1999 rewrite of IP Firewalling, aiming for kernel 2.3.x. * * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling * Copyright (C) 2000-2004 Netfilter Core Team <coreteam@netfilter.org> */ #include <linux/module.h> #include <linux/netfilter_ipv4/ip_tables.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <net/sock.h> #include <net/route.h> #include <linux/ip.h> #include <net/ip.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team <coreteam@netfilter.org>"); MODULE_DESCRIPTION("iptables mangle table"); #define MANGLE_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | \ (1 << NF_INET_LOCAL_IN) | \ (1 << NF_INET_FORWARD) | \ (1 << NF_INET_LOCAL_OUT) | \ (1 << NF_INET_POST_ROUTING)) static const struct xt_table packet_mangler = { .name = "mangle", .valid_hooks = MANGLE_VALID_HOOKS, .me = THIS_MODULE, .af = NFPROTO_IPV4, .priority = NF_IP_PRI_MANGLE, }; static unsigned int ipt_mangle_out(struct sk_buff *skb, const struct nf_hook_state *state, void *priv) { unsigned int ret; const struct iphdr *iph; u_int8_t tos; __be32 saddr, daddr; u_int32_t mark; int err; /* Save things which could affect route */ mark = skb->mark; iph = ip_hdr(skb); saddr = iph->saddr; daddr = iph->daddr; tos = iph->tos; ret = ipt_do_table(skb, state, priv); /* Reroute for ANY change. */ if (ret != NF_DROP && ret != NF_STOLEN) { iph = ip_hdr(skb); if (iph->saddr != saddr || iph->daddr != daddr || skb->mark != mark || iph->tos != tos) { err = ip_route_me_harder(state->net, state->sk, skb, RTN_UNSPEC); if (err < 0) ret = NF_DROP_ERR(err); } } return ret; } /* The work comes in here from netfilter.c. 
*/ static unsigned int iptable_mangle_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { if (state->hook == NF_INET_LOCAL_OUT) return ipt_mangle_out(skb, state, priv); return ipt_do_table(skb, state, priv); } static struct nf_hook_ops *mangle_ops __read_mostly; static int iptable_mangle_table_init(struct net *net) { struct ipt_replace *repl; int ret; repl = ipt_alloc_initial_table(&packet_mangler); if (repl == NULL) return -ENOMEM; ret = ipt_register_table(net, &packet_mangler, repl, mangle_ops); kfree(repl); return ret; } static void __net_exit iptable_mangle_net_pre_exit(struct net *net) { ipt_unregister_table_pre_exit(net, "mangle"); } static void __net_exit iptable_mangle_net_exit(struct net *net) { ipt_unregister_table_exit(net, "mangle"); } static struct pernet_operations iptable_mangle_net_ops = { .pre_exit = iptable_mangle_net_pre_exit, .exit = iptable_mangle_net_exit, }; static int __init iptable_mangle_init(void) { int ret = xt_register_template(&packet_mangler, iptable_mangle_table_init); if (ret < 0) return ret; mangle_ops = xt_hook_ops_alloc(&packet_mangler, iptable_mangle_hook); if (IS_ERR(mangle_ops)) { xt_unregister_template(&packet_mangler); ret = PTR_ERR(mangle_ops); return ret; } ret = register_pernet_subsys(&iptable_mangle_net_ops); if (ret < 0) { xt_unregister_template(&packet_mangler); kfree(mangle_ops); return ret; } return ret; } static void __exit iptable_mangle_fini(void) { unregister_pernet_subsys(&iptable_mangle_net_ops); xt_unregister_template(&packet_mangler); kfree(mangle_ops); } module_init(iptable_mangle_init); module_exit(iptable_mangle_fini);
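The key decision in ipt_mangle_out() above is to redo routing only when a mangle rule actually changed a field that route selection depends on. The sketch below mirrors that before/after comparison in plain userspace C; struct pkt_keys and needs_reroute() are illustrative names, not kernel interfaces.

/*
 * Userspace sketch of the "reroute only if something relevant changed" test
 * made by ipt_mangle_out().  Field and function names are illustrative.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct pkt_keys {
	uint32_t saddr;
	uint32_t daddr;
	uint32_t mark;
	uint8_t  tos;
};

/*
 * A mangle rule may rewrite any of these; rerouting is needed only when a
 * field that feeds route selection actually differs after the table ran.
 */
static bool needs_reroute(const struct pkt_keys *before,
			  const struct pkt_keys *after)
{
	return before->saddr != after->saddr ||
	       before->daddr != after->daddr ||
	       before->mark  != after->mark  ||
	       before->tos   != after->tos;
}

int main(void)
{
	struct pkt_keys before = { 0x0a000001, 0x0a000002, 0, 0 };
	struct pkt_keys after  = before;

	after.tos = 0x10;	/* e.g. a TOS target fired */
	printf("reroute needed: %s\n", needs_reroute(&before, &after) ? "yes" : "no");
	return 0;
}

Saving the four fields before ipt_do_table() and comparing afterwards is cheaper than unconditionally re-running route lookup for every packet that traverses the OUTPUT hook.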
// SPDX-License-Identifier: GPL-2.0 /* * security/tomoyo/file.c * * Copyright (C) 2005-2011 NTT DATA CORPORATION */ #include "common.h" #include <linux/slab.h> /* * Mapping table from "enum tomoyo_path_acl_index" to "enum tomoyo_mac_index". */ static const u8 tomoyo_p2mac[TOMOYO_MAX_PATH_OPERATION] = { [TOMOYO_TYPE_EXECUTE] = TOMOYO_MAC_FILE_EXECUTE, [TOMOYO_TYPE_READ] = TOMOYO_MAC_FILE_OPEN, [TOMOYO_TYPE_WRITE] = TOMOYO_MAC_FILE_OPEN, [TOMOYO_TYPE_APPEND] = TOMOYO_MAC_FILE_OPEN, [TOMOYO_TYPE_UNLINK] = TOMOYO_MAC_FILE_UNLINK, [TOMOYO_TYPE_GETATTR] = TOMOYO_MAC_FILE_GETATTR, [TOMOYO_TYPE_RMDIR] = TOMOYO_MAC_FILE_RMDIR, [TOMOYO_TYPE_TRUNCATE] = TOMOYO_MAC_FILE_TRUNCATE, [TOMOYO_TYPE_SYMLINK] = TOMOYO_MAC_FILE_SYMLINK, [TOMOYO_TYPE_CHROOT] = TOMOYO_MAC_FILE_CHROOT, [TOMOYO_TYPE_UMOUNT] = TOMOYO_MAC_FILE_UMOUNT, }; /* * Mapping table from "enum tomoyo_mkdev_acl_index" to "enum tomoyo_mac_index". */ const u8 tomoyo_pnnn2mac[TOMOYO_MAX_MKDEV_OPERATION] = { [TOMOYO_TYPE_MKBLOCK] = TOMOYO_MAC_FILE_MKBLOCK, [TOMOYO_TYPE_MKCHAR] = TOMOYO_MAC_FILE_MKCHAR, }; /* * Mapping table from "enum tomoyo_path2_acl_index" to "enum tomoyo_mac_index". */ const u8 tomoyo_pp2mac[TOMOYO_MAX_PATH2_OPERATION] = { [TOMOYO_TYPE_LINK] = TOMOYO_MAC_FILE_LINK, [TOMOYO_TYPE_RENAME] = TOMOYO_MAC_FILE_RENAME, [TOMOYO_TYPE_PIVOT_ROOT] = TOMOYO_MAC_FILE_PIVOT_ROOT, }; /* * Mapping table from "enum tomoyo_path_number_acl_index" to * "enum tomoyo_mac_index". */ const u8 tomoyo_pn2mac[TOMOYO_MAX_PATH_NUMBER_OPERATION] = { [TOMOYO_TYPE_CREATE] = TOMOYO_MAC_FILE_CREATE, [TOMOYO_TYPE_MKDIR] = TOMOYO_MAC_FILE_MKDIR, [TOMOYO_TYPE_MKFIFO] = TOMOYO_MAC_FILE_MKFIFO, [TOMOYO_TYPE_MKSOCK] = TOMOYO_MAC_FILE_MKSOCK, [TOMOYO_TYPE_IOCTL] = TOMOYO_MAC_FILE_IOCTL, [TOMOYO_TYPE_CHMOD] = TOMOYO_MAC_FILE_CHMOD, [TOMOYO_TYPE_CHOWN] = TOMOYO_MAC_FILE_CHOWN, [TOMOYO_TYPE_CHGRP] = TOMOYO_MAC_FILE_CHGRP, }; /** * tomoyo_put_name_union - Drop reference on "struct tomoyo_name_union". * * @ptr: Pointer to "struct tomoyo_name_union". * * Returns nothing. */ void tomoyo_put_name_union(struct tomoyo_name_union *ptr) { tomoyo_put_group(ptr->group); tomoyo_put_name(ptr->filename); } /** * tomoyo_compare_name_union - Check whether a name matches "struct tomoyo_name_union" or not. * * @name: Pointer to "struct tomoyo_path_info". * @ptr: Pointer to "struct tomoyo_name_union". * * Returns "struct tomoyo_path_info" if @name matches @ptr, NULL otherwise. */ const struct tomoyo_path_info * tomoyo_compare_name_union(const struct tomoyo_path_info *name, const struct tomoyo_name_union *ptr) { if (ptr->group) return tomoyo_path_matches_group(name, ptr->group); if (tomoyo_path_matches_pattern(name, ptr->filename)) return ptr->filename; return NULL; } /** * tomoyo_put_number_union - Drop reference on "struct tomoyo_number_union". * * @ptr: Pointer to "struct tomoyo_number_union". 
* * Returns nothing. */ void tomoyo_put_number_union(struct tomoyo_number_union *ptr) { tomoyo_put_group(ptr->group); } /** * tomoyo_compare_number_union - Check whether a value matches "struct tomoyo_number_union" or not. * * @value: Number to check. * @ptr: Pointer to "struct tomoyo_number_union". * * Returns true if @value matches @ptr, false otherwise. */ bool tomoyo_compare_number_union(const unsigned long value, const struct tomoyo_number_union *ptr) { if (ptr->group) return tomoyo_number_matches_group(value, value, ptr->group); return value >= ptr->values[0] && value <= ptr->values[1]; } /** * tomoyo_add_slash - Add trailing '/' if needed. * * @buf: Pointer to "struct tomoyo_path_info". * * Returns nothing. * * @buf must be generated by tomoyo_encode() because this function does not * allocate memory for adding '/'. */ static void tomoyo_add_slash(struct tomoyo_path_info *buf) { if (buf->is_dir) return; /* * This is OK because tomoyo_encode() reserves space for appending "/". */ strcat((char *) buf->name, "/"); tomoyo_fill_path_info(buf); } /** * tomoyo_get_realpath - Get realpath. * * @buf: Pointer to "struct tomoyo_path_info". * @path: Pointer to "struct path". * * Returns true on success, false otherwise. */ static bool tomoyo_get_realpath(struct tomoyo_path_info *buf, const struct path *path) { buf->name = tomoyo_realpath_from_path(path); if (buf->name) { tomoyo_fill_path_info(buf); return true; } return false; } /** * tomoyo_audit_path_log - Audit path request log. * * @r: Pointer to "struct tomoyo_request_info". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_audit_path_log(struct tomoyo_request_info *r) { return tomoyo_supervisor(r, "file %s %s\n", tomoyo_path_keyword [r->param.path.operation], r->param.path.filename->name); } /** * tomoyo_audit_path2_log - Audit path/path request log. * * @r: Pointer to "struct tomoyo_request_info". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_audit_path2_log(struct tomoyo_request_info *r) { return tomoyo_supervisor(r, "file %s %s %s\n", tomoyo_mac_keywords [tomoyo_pp2mac[r->param.path2.operation]], r->param.path2.filename1->name, r->param.path2.filename2->name); } /** * tomoyo_audit_mkdev_log - Audit path/number/number/number request log. * * @r: Pointer to "struct tomoyo_request_info". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_audit_mkdev_log(struct tomoyo_request_info *r) { return tomoyo_supervisor(r, "file %s %s 0%o %u %u\n", tomoyo_mac_keywords [tomoyo_pnnn2mac[r->param.mkdev.operation]], r->param.mkdev.filename->name, r->param.mkdev.mode, r->param.mkdev.major, r->param.mkdev.minor); } /** * tomoyo_audit_path_number_log - Audit path/number request log. * * @r: Pointer to "struct tomoyo_request_info". * * Returns 0 on success, negative value otherwise. 
*/ static int tomoyo_audit_path_number_log(struct tomoyo_request_info *r) { const u8 type = r->param.path_number.operation; u8 radix; char buffer[64]; switch (type) { case TOMOYO_TYPE_CREATE: case TOMOYO_TYPE_MKDIR: case TOMOYO_TYPE_MKFIFO: case TOMOYO_TYPE_MKSOCK: case TOMOYO_TYPE_CHMOD: radix = TOMOYO_VALUE_TYPE_OCTAL; break; case TOMOYO_TYPE_IOCTL: radix = TOMOYO_VALUE_TYPE_HEXADECIMAL; break; default: radix = TOMOYO_VALUE_TYPE_DECIMAL; break; } tomoyo_print_ulong(buffer, sizeof(buffer), r->param.path_number.number, radix); return tomoyo_supervisor(r, "file %s %s %s\n", tomoyo_mac_keywords [tomoyo_pn2mac[type]], r->param.path_number.filename->name, buffer); } /** * tomoyo_check_path_acl - Check permission for path operation. * * @r: Pointer to "struct tomoyo_request_info". * @ptr: Pointer to "struct tomoyo_acl_info". * * Returns true if granted, false otherwise. * * To be able to use wildcard for domain transition, this function sets * matching entry on success. Since the caller holds tomoyo_read_lock(), * it is safe to set matching entry. */ static bool tomoyo_check_path_acl(struct tomoyo_request_info *r, const struct tomoyo_acl_info *ptr) { const struct tomoyo_path_acl *acl = container_of(ptr, typeof(*acl), head); if (acl->perm & (1 << r->param.path.operation)) { r->param.path.matched_path = tomoyo_compare_name_union(r->param.path.filename, &acl->name); return r->param.path.matched_path != NULL; } return false; } /** * tomoyo_check_path_number_acl - Check permission for path number operation. * * @r: Pointer to "struct tomoyo_request_info". * @ptr: Pointer to "struct tomoyo_acl_info". * * Returns true if granted, false otherwise. */ static bool tomoyo_check_path_number_acl(struct tomoyo_request_info *r, const struct tomoyo_acl_info *ptr) { const struct tomoyo_path_number_acl *acl = container_of(ptr, typeof(*acl), head); return (acl->perm & (1 << r->param.path_number.operation)) && tomoyo_compare_number_union(r->param.path_number.number, &acl->number) && tomoyo_compare_name_union(r->param.path_number.filename, &acl->name); } /** * tomoyo_check_path2_acl - Check permission for path path operation. * * @r: Pointer to "struct tomoyo_request_info". * @ptr: Pointer to "struct tomoyo_acl_info". * * Returns true if granted, false otherwise. */ static bool tomoyo_check_path2_acl(struct tomoyo_request_info *r, const struct tomoyo_acl_info *ptr) { const struct tomoyo_path2_acl *acl = container_of(ptr, typeof(*acl), head); return (acl->perm & (1 << r->param.path2.operation)) && tomoyo_compare_name_union(r->param.path2.filename1, &acl->name1) && tomoyo_compare_name_union(r->param.path2.filename2, &acl->name2); } /** * tomoyo_check_mkdev_acl - Check permission for path number number number operation. * * @r: Pointer to "struct tomoyo_request_info". * @ptr: Pointer to "struct tomoyo_acl_info". * * Returns true if granted, false otherwise. */ static bool tomoyo_check_mkdev_acl(struct tomoyo_request_info *r, const struct tomoyo_acl_info *ptr) { const struct tomoyo_mkdev_acl *acl = container_of(ptr, typeof(*acl), head); return (acl->perm & (1 << r->param.mkdev.operation)) && tomoyo_compare_number_union(r->param.mkdev.mode, &acl->mode) && tomoyo_compare_number_union(r->param.mkdev.major, &acl->major) && tomoyo_compare_number_union(r->param.mkdev.minor, &acl->minor) && tomoyo_compare_name_union(r->param.mkdev.filename, &acl->name); } /** * tomoyo_same_path_acl - Check for duplicated "struct tomoyo_path_acl" entry. * * @a: Pointer to "struct tomoyo_acl_info". * @b: Pointer to "struct tomoyo_acl_info". 
* * Returns true if @a == @b except permission bits, false otherwise. */ static bool tomoyo_same_path_acl(const struct tomoyo_acl_info *a, const struct tomoyo_acl_info *b) { const struct tomoyo_path_acl *p1 = container_of(a, typeof(*p1), head); const struct tomoyo_path_acl *p2 = container_of(b, typeof(*p2), head); return tomoyo_same_name_union(&p1->name, &p2->name); } /** * tomoyo_merge_path_acl - Merge duplicated "struct tomoyo_path_acl" entry. * * @a: Pointer to "struct tomoyo_acl_info". * @b: Pointer to "struct tomoyo_acl_info". * @is_delete: True for @a &= ~@b, false for @a |= @b. * * Returns true if @a is empty, false otherwise. */ static bool tomoyo_merge_path_acl(struct tomoyo_acl_info *a, struct tomoyo_acl_info *b, const bool is_delete) { u16 * const a_perm = &container_of(a, struct tomoyo_path_acl, head) ->perm; u16 perm = READ_ONCE(*a_perm); const u16 b_perm = container_of(b, struct tomoyo_path_acl, head)->perm; if (is_delete) perm &= ~b_perm; else perm |= b_perm; WRITE_ONCE(*a_perm, perm); return !perm; } /** * tomoyo_update_path_acl - Update "struct tomoyo_path_acl" list. * * @perm: Permission. * @param: Pointer to "struct tomoyo_acl_param". * * Returns 0 on success, negative value otherwise. * * Caller holds tomoyo_read_lock(). */ static int tomoyo_update_path_acl(const u16 perm, struct tomoyo_acl_param *param) { struct tomoyo_path_acl e = { .head.type = TOMOYO_TYPE_PATH_ACL, .perm = perm }; int error; if (!tomoyo_parse_name_union(param, &e.name)) error = -EINVAL; else error = tomoyo_update_domain(&e.head, sizeof(e), param, tomoyo_same_path_acl, tomoyo_merge_path_acl); tomoyo_put_name_union(&e.name); return error; } /** * tomoyo_same_mkdev_acl - Check for duplicated "struct tomoyo_mkdev_acl" entry. * * @a: Pointer to "struct tomoyo_acl_info". * @b: Pointer to "struct tomoyo_acl_info". * * Returns true if @a == @b except permission bits, false otherwise. */ static bool tomoyo_same_mkdev_acl(const struct tomoyo_acl_info *a, const struct tomoyo_acl_info *b) { const struct tomoyo_mkdev_acl *p1 = container_of(a, typeof(*p1), head); const struct tomoyo_mkdev_acl *p2 = container_of(b, typeof(*p2), head); return tomoyo_same_name_union(&p1->name, &p2->name) && tomoyo_same_number_union(&p1->mode, &p2->mode) && tomoyo_same_number_union(&p1->major, &p2->major) && tomoyo_same_number_union(&p1->minor, &p2->minor); } /** * tomoyo_merge_mkdev_acl - Merge duplicated "struct tomoyo_mkdev_acl" entry. * * @a: Pointer to "struct tomoyo_acl_info". * @b: Pointer to "struct tomoyo_acl_info". * @is_delete: True for @a &= ~@b, false for @a |= @b. * * Returns true if @a is empty, false otherwise. */ static bool tomoyo_merge_mkdev_acl(struct tomoyo_acl_info *a, struct tomoyo_acl_info *b, const bool is_delete) { u8 *const a_perm = &container_of(a, struct tomoyo_mkdev_acl, head)->perm; u8 perm = READ_ONCE(*a_perm); const u8 b_perm = container_of(b, struct tomoyo_mkdev_acl, head) ->perm; if (is_delete) perm &= ~b_perm; else perm |= b_perm; WRITE_ONCE(*a_perm, perm); return !perm; } /** * tomoyo_update_mkdev_acl - Update "struct tomoyo_mkdev_acl" list. * * @perm: Permission. * @param: Pointer to "struct tomoyo_acl_param". * * Returns 0 on success, negative value otherwise. * * Caller holds tomoyo_read_lock(). 
*/ static int tomoyo_update_mkdev_acl(const u8 perm, struct tomoyo_acl_param *param) { struct tomoyo_mkdev_acl e = { .head.type = TOMOYO_TYPE_MKDEV_ACL, .perm = perm }; int error; if (!tomoyo_parse_name_union(param, &e.name) || !tomoyo_parse_number_union(param, &e.mode) || !tomoyo_parse_number_union(param, &e.major) || !tomoyo_parse_number_union(param, &e.minor)) error = -EINVAL; else error = tomoyo_update_domain(&e.head, sizeof(e), param, tomoyo_same_mkdev_acl, tomoyo_merge_mkdev_acl); tomoyo_put_name_union(&e.name); tomoyo_put_number_union(&e.mode); tomoyo_put_number_union(&e.major); tomoyo_put_number_union(&e.minor); return error; } /** * tomoyo_same_path2_acl - Check for duplicated "struct tomoyo_path2_acl" entry. * * @a: Pointer to "struct tomoyo_acl_info". * @b: Pointer to "struct tomoyo_acl_info". * * Returns true if @a == @b except permission bits, false otherwise. */ static bool tomoyo_same_path2_acl(const struct tomoyo_acl_info *a, const struct tomoyo_acl_info *b) { const struct tomoyo_path2_acl *p1 = container_of(a, typeof(*p1), head); const struct tomoyo_path2_acl *p2 = container_of(b, typeof(*p2), head); return tomoyo_same_name_union(&p1->name1, &p2->name1) && tomoyo_same_name_union(&p1->name2, &p2->name2); } /** * tomoyo_merge_path2_acl - Merge duplicated "struct tomoyo_path2_acl" entry. * * @a: Pointer to "struct tomoyo_acl_info". * @b: Pointer to "struct tomoyo_acl_info". * @is_delete: True for @a &= ~@b, false for @a |= @b. * * Returns true if @a is empty, false otherwise. */ static bool tomoyo_merge_path2_acl(struct tomoyo_acl_info *a, struct tomoyo_acl_info *b, const bool is_delete) { u8 * const a_perm = &container_of(a, struct tomoyo_path2_acl, head) ->perm; u8 perm = READ_ONCE(*a_perm); const u8 b_perm = container_of(b, struct tomoyo_path2_acl, head)->perm; if (is_delete) perm &= ~b_perm; else perm |= b_perm; WRITE_ONCE(*a_perm, perm); return !perm; } /** * tomoyo_update_path2_acl - Update "struct tomoyo_path2_acl" list. * * @perm: Permission. * @param: Pointer to "struct tomoyo_acl_param". * * Returns 0 on success, negative value otherwise. * * Caller holds tomoyo_read_lock(). */ static int tomoyo_update_path2_acl(const u8 perm, struct tomoyo_acl_param *param) { struct tomoyo_path2_acl e = { .head.type = TOMOYO_TYPE_PATH2_ACL, .perm = perm }; int error; if (!tomoyo_parse_name_union(param, &e.name1) || !tomoyo_parse_name_union(param, &e.name2)) error = -EINVAL; else error = tomoyo_update_domain(&e.head, sizeof(e), param, tomoyo_same_path2_acl, tomoyo_merge_path2_acl); tomoyo_put_name_union(&e.name1); tomoyo_put_name_union(&e.name2); return error; } /** * tomoyo_path_permission - Check permission for single path operation. * * @r: Pointer to "struct tomoyo_request_info". * @operation: Type of operation. * @filename: Filename to check. * * Returns 0 on success, negative value otherwise. * * Caller holds tomoyo_read_lock(). */ static int tomoyo_path_permission(struct tomoyo_request_info *r, u8 operation, const struct tomoyo_path_info *filename) { int error; r->type = tomoyo_p2mac[operation]; r->mode = tomoyo_get_mode(r->domain->ns, r->profile, r->type); if (r->mode == TOMOYO_CONFIG_DISABLED) return 0; r->param_type = TOMOYO_TYPE_PATH_ACL; r->param.path.filename = filename; r->param.path.operation = operation; do { tomoyo_check_acl(r, tomoyo_check_path_acl); error = tomoyo_audit_path_log(r); } while (error == TOMOYO_RETRY_REQUEST); return error; } /** * tomoyo_execute_permission - Check permission for execute operation. * * @r: Pointer to "struct tomoyo_request_info". 
* @filename: Filename to check. * * Returns 0 on success, negative value otherwise. * * Caller holds tomoyo_read_lock(). */ int tomoyo_execute_permission(struct tomoyo_request_info *r, const struct tomoyo_path_info *filename) { /* * Unlike other permission checks, this check is done regardless of * profile mode settings in order to check for domain transition * preference. */ r->type = TOMOYO_MAC_FILE_EXECUTE; r->mode = tomoyo_get_mode(r->domain->ns, r->profile, r->type); r->param_type = TOMOYO_TYPE_PATH_ACL; r->param.path.filename = filename; r->param.path.operation = TOMOYO_TYPE_EXECUTE; tomoyo_check_acl(r, tomoyo_check_path_acl); r->ee->transition = r->matched_acl && r->matched_acl->cond ? r->matched_acl->cond->transit : NULL; if (r->mode != TOMOYO_CONFIG_DISABLED) return tomoyo_audit_path_log(r); return 0; } /** * tomoyo_same_path_number_acl - Check for duplicated "struct tomoyo_path_number_acl" entry. * * @a: Pointer to "struct tomoyo_acl_info". * @b: Pointer to "struct tomoyo_acl_info". * * Returns true if @a == @b except permission bits, false otherwise. */ static bool tomoyo_same_path_number_acl(const struct tomoyo_acl_info *a, const struct tomoyo_acl_info *b) { const struct tomoyo_path_number_acl *p1 = container_of(a, typeof(*p1), head); const struct tomoyo_path_number_acl *p2 = container_of(b, typeof(*p2), head); return tomoyo_same_name_union(&p1->name, &p2->name) && tomoyo_same_number_union(&p1->number, &p2->number); } /** * tomoyo_merge_path_number_acl - Merge duplicated "struct tomoyo_path_number_acl" entry. * * @a: Pointer to "struct tomoyo_acl_info". * @b: Pointer to "struct tomoyo_acl_info". * @is_delete: True for @a &= ~@b, false for @a |= @b. * * Returns true if @a is empty, false otherwise. */ static bool tomoyo_merge_path_number_acl(struct tomoyo_acl_info *a, struct tomoyo_acl_info *b, const bool is_delete) { u8 * const a_perm = &container_of(a, struct tomoyo_path_number_acl, head)->perm; u8 perm = READ_ONCE(*a_perm); const u8 b_perm = container_of(b, struct tomoyo_path_number_acl, head) ->perm; if (is_delete) perm &= ~b_perm; else perm |= b_perm; WRITE_ONCE(*a_perm, perm); return !perm; } /** * tomoyo_update_path_number_acl - Update ioctl/chmod/chown/chgrp ACL. * * @perm: Permission. * @param: Pointer to "struct tomoyo_acl_param". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_update_path_number_acl(const u8 perm, struct tomoyo_acl_param *param) { struct tomoyo_path_number_acl e = { .head.type = TOMOYO_TYPE_PATH_NUMBER_ACL, .perm = perm }; int error; if (!tomoyo_parse_name_union(param, &e.name) || !tomoyo_parse_number_union(param, &e.number)) error = -EINVAL; else error = tomoyo_update_domain(&e.head, sizeof(e), param, tomoyo_same_path_number_acl, tomoyo_merge_path_number_acl); tomoyo_put_name_union(&e.name); tomoyo_put_number_union(&e.number); return error; } /** * tomoyo_path_number_perm - Check permission for "create", "mkdir", "mkfifo", "mksock", "ioctl", "chmod", "chown", "chgrp". * * @type: Type of operation. * @path: Pointer to "struct path". * @number: Number. * * Returns 0 on success, negative value otherwise. 
*/ int tomoyo_path_number_perm(const u8 type, const struct path *path, unsigned long number) { struct tomoyo_request_info r; struct tomoyo_obj_info obj = { .path1 = { .mnt = path->mnt, .dentry = path->dentry }, }; int error = -ENOMEM; struct tomoyo_path_info buf; int idx; if (tomoyo_init_request_info(&r, NULL, tomoyo_pn2mac[type]) == TOMOYO_CONFIG_DISABLED || !path->dentry) return 0; idx = tomoyo_read_lock(); if (!tomoyo_get_realpath(&buf, path)) goto out; r.obj = &obj; if (type == TOMOYO_TYPE_MKDIR) tomoyo_add_slash(&buf); r.param_type = TOMOYO_TYPE_PATH_NUMBER_ACL; r.param.path_number.operation = type; r.param.path_number.filename = &buf; r.param.path_number.number = number; do { tomoyo_check_acl(&r, tomoyo_check_path_number_acl); error = tomoyo_audit_path_number_log(&r); } while (error == TOMOYO_RETRY_REQUEST); kfree(buf.name); out: tomoyo_read_unlock(idx); if (r.mode != TOMOYO_CONFIG_ENFORCING) error = 0; return error; } /** * tomoyo_check_open_permission - Check permission for "read" and "write". * * @domain: Pointer to "struct tomoyo_domain_info". * @path: Pointer to "struct path". * @flag: Flags for open(). * * Returns 0 on success, negative value otherwise. */ int tomoyo_check_open_permission(struct tomoyo_domain_info *domain, const struct path *path, const int flag) { const u8 acc_mode = ACC_MODE(flag); int error = 0; struct tomoyo_path_info buf; struct tomoyo_request_info r; struct tomoyo_obj_info obj = { .path1 = { .mnt = path->mnt, .dentry = path->dentry }, }; int idx; buf.name = NULL; r.mode = TOMOYO_CONFIG_DISABLED; idx = tomoyo_read_lock(); if (acc_mode && tomoyo_init_request_info(&r, domain, TOMOYO_MAC_FILE_OPEN) != TOMOYO_CONFIG_DISABLED) { if (!tomoyo_get_realpath(&buf, path)) { error = -ENOMEM; goto out; } r.obj = &obj; if (acc_mode & MAY_READ) error = tomoyo_path_permission(&r, TOMOYO_TYPE_READ, &buf); if (!error && (acc_mode & MAY_WRITE)) error = tomoyo_path_permission(&r, (flag & O_APPEND) ? TOMOYO_TYPE_APPEND : TOMOYO_TYPE_WRITE, &buf); } out: kfree(buf.name); tomoyo_read_unlock(idx); if (r.mode != TOMOYO_CONFIG_ENFORCING) error = 0; return error; } /** * tomoyo_path_perm - Check permission for "unlink", "rmdir", "truncate", "symlink", "append", "chroot" and "unmount". * * @operation: Type of operation. * @path: Pointer to "struct path". * @target: Symlink's target if @operation is TOMOYO_TYPE_SYMLINK, * NULL otherwise. * * Returns 0 on success, negative value otherwise. 
*/ int tomoyo_path_perm(const u8 operation, const struct path *path, const char *target) { struct tomoyo_request_info r; struct tomoyo_obj_info obj = { .path1 = { .mnt = path->mnt, .dentry = path->dentry }, }; int error; struct tomoyo_path_info buf; bool is_enforce; struct tomoyo_path_info symlink_target; int idx; if (tomoyo_init_request_info(&r, NULL, tomoyo_p2mac[operation]) == TOMOYO_CONFIG_DISABLED) return 0; is_enforce = (r.mode == TOMOYO_CONFIG_ENFORCING); error = -ENOMEM; buf.name = NULL; idx = tomoyo_read_lock(); if (!tomoyo_get_realpath(&buf, path)) goto out; r.obj = &obj; switch (operation) { case TOMOYO_TYPE_RMDIR: case TOMOYO_TYPE_CHROOT: tomoyo_add_slash(&buf); break; case TOMOYO_TYPE_SYMLINK: symlink_target.name = tomoyo_encode(target); if (!symlink_target.name) goto out; tomoyo_fill_path_info(&symlink_target); obj.symlink_target = &symlink_target; break; } error = tomoyo_path_permission(&r, operation, &buf); if (operation == TOMOYO_TYPE_SYMLINK) kfree(symlink_target.name); out: kfree(buf.name); tomoyo_read_unlock(idx); if (!is_enforce) error = 0; return error; } /** * tomoyo_mkdev_perm - Check permission for "mkblock" and "mkchar". * * @operation: Type of operation. (TOMOYO_TYPE_MKCHAR or TOMOYO_TYPE_MKBLOCK) * @path: Pointer to "struct path". * @mode: Create mode. * @dev: Device number. * * Returns 0 on success, negative value otherwise. */ int tomoyo_mkdev_perm(const u8 operation, const struct path *path, const unsigned int mode, unsigned int dev) { struct tomoyo_request_info r; struct tomoyo_obj_info obj = { .path1 = { .mnt = path->mnt, .dentry = path->dentry }, }; int error = -ENOMEM; struct tomoyo_path_info buf; int idx; if (tomoyo_init_request_info(&r, NULL, tomoyo_pnnn2mac[operation]) == TOMOYO_CONFIG_DISABLED) return 0; idx = tomoyo_read_lock(); error = -ENOMEM; if (tomoyo_get_realpath(&buf, path)) { r.obj = &obj; dev = new_decode_dev(dev); r.param_type = TOMOYO_TYPE_MKDEV_ACL; r.param.mkdev.filename = &buf; r.param.mkdev.operation = operation; r.param.mkdev.mode = mode; r.param.mkdev.major = MAJOR(dev); r.param.mkdev.minor = MINOR(dev); tomoyo_check_acl(&r, tomoyo_check_mkdev_acl); error = tomoyo_audit_mkdev_log(&r); kfree(buf.name); } tomoyo_read_unlock(idx); if (r.mode != TOMOYO_CONFIG_ENFORCING) error = 0; return error; } /** * tomoyo_path2_perm - Check permission for "rename", "link" and "pivot_root". * * @operation: Type of operation. * @path1: Pointer to "struct path". * @path2: Pointer to "struct path". * * Returns 0 on success, negative value otherwise. 
*/ int tomoyo_path2_perm(const u8 operation, const struct path *path1, const struct path *path2) { int error = -ENOMEM; struct tomoyo_path_info buf1; struct tomoyo_path_info buf2; struct tomoyo_request_info r; struct tomoyo_obj_info obj = { .path1 = { .mnt = path1->mnt, .dentry = path1->dentry }, .path2 = { .mnt = path2->mnt, .dentry = path2->dentry } }; int idx; if (tomoyo_init_request_info(&r, NULL, tomoyo_pp2mac[operation]) == TOMOYO_CONFIG_DISABLED) return 0; buf1.name = NULL; buf2.name = NULL; idx = tomoyo_read_lock(); if (!tomoyo_get_realpath(&buf1, path1) || !tomoyo_get_realpath(&buf2, path2)) goto out; switch (operation) { case TOMOYO_TYPE_RENAME: case TOMOYO_TYPE_LINK: if (!d_is_dir(path1->dentry)) break; fallthrough; case TOMOYO_TYPE_PIVOT_ROOT: tomoyo_add_slash(&buf1); tomoyo_add_slash(&buf2); break; } r.obj = &obj; r.param_type = TOMOYO_TYPE_PATH2_ACL; r.param.path2.operation = operation; r.param.path2.filename1 = &buf1; r.param.path2.filename2 = &buf2; do { tomoyo_check_acl(&r, tomoyo_check_path2_acl); error = tomoyo_audit_path2_log(&r); } while (error == TOMOYO_RETRY_REQUEST); out: kfree(buf1.name); kfree(buf2.name); tomoyo_read_unlock(idx); if (r.mode != TOMOYO_CONFIG_ENFORCING) error = 0; return error; } /** * tomoyo_same_mount_acl - Check for duplicated "struct tomoyo_mount_acl" entry. * * @a: Pointer to "struct tomoyo_acl_info". * @b: Pointer to "struct tomoyo_acl_info". * * Returns true if @a == @b, false otherwise. */ static bool tomoyo_same_mount_acl(const struct tomoyo_acl_info *a, const struct tomoyo_acl_info *b) { const struct tomoyo_mount_acl *p1 = container_of(a, typeof(*p1), head); const struct tomoyo_mount_acl *p2 = container_of(b, typeof(*p2), head); return tomoyo_same_name_union(&p1->dev_name, &p2->dev_name) && tomoyo_same_name_union(&p1->dir_name, &p2->dir_name) && tomoyo_same_name_union(&p1->fs_type, &p2->fs_type) && tomoyo_same_number_union(&p1->flags, &p2->flags); } /** * tomoyo_update_mount_acl - Write "struct tomoyo_mount_acl" list. * * @param: Pointer to "struct tomoyo_acl_param". * * Returns 0 on success, negative value otherwise. * * Caller holds tomoyo_read_lock(). */ static int tomoyo_update_mount_acl(struct tomoyo_acl_param *param) { struct tomoyo_mount_acl e = { .head.type = TOMOYO_TYPE_MOUNT_ACL }; int error; if (!tomoyo_parse_name_union(param, &e.dev_name) || !tomoyo_parse_name_union(param, &e.dir_name) || !tomoyo_parse_name_union(param, &e.fs_type) || !tomoyo_parse_number_union(param, &e.flags)) error = -EINVAL; else error = tomoyo_update_domain(&e.head, sizeof(e), param, tomoyo_same_mount_acl, NULL); tomoyo_put_name_union(&e.dev_name); tomoyo_put_name_union(&e.dir_name); tomoyo_put_name_union(&e.fs_type); tomoyo_put_number_union(&e.flags); return error; } /** * tomoyo_write_file - Update file related list. * * @param: Pointer to "struct tomoyo_acl_param". * * Returns 0 on success, negative value otherwise. * * Caller holds tomoyo_read_lock(). 
*/ int tomoyo_write_file(struct tomoyo_acl_param *param) { u16 perm = 0; u8 type; const char *operation = tomoyo_read_token(param); for (type = 0; type < TOMOYO_MAX_PATH_OPERATION; type++) if (tomoyo_permstr(operation, tomoyo_path_keyword[type])) perm |= 1 << type; if (perm) return tomoyo_update_path_acl(perm, param); for (type = 0; type < TOMOYO_MAX_PATH2_OPERATION; type++) if (tomoyo_permstr(operation, tomoyo_mac_keywords[tomoyo_pp2mac[type]])) perm |= 1 << type; if (perm) return tomoyo_update_path2_acl(perm, param); for (type = 0; type < TOMOYO_MAX_PATH_NUMBER_OPERATION; type++) if (tomoyo_permstr(operation, tomoyo_mac_keywords[tomoyo_pn2mac[type]])) perm |= 1 << type; if (perm) return tomoyo_update_path_number_acl(perm, param); for (type = 0; type < TOMOYO_MAX_MKDEV_OPERATION; type++) if (tomoyo_permstr(operation, tomoyo_mac_keywords[tomoyo_pnnn2mac[type]])) perm |= 1 << type; if (perm) return tomoyo_update_mkdev_acl(perm, param); if (tomoyo_permstr(operation, tomoyo_mac_keywords[TOMOYO_MAC_FILE_MOUNT])) return tomoyo_update_mount_acl(param); return -EINVAL; }
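tomoyo_write_file() above turns a policy line along the lines of "file read/write/append /etc/fstab" into a per-operation bitmask, and the tomoyo_check_*_acl() helpers later test a single bit of that mask. The userspace sketch below shows the same keyword-to-bitmask scheme with an abbreviated keyword table; strstr() stands in for tomoyo_permstr(), and all names in it are illustrative rather than TOMOYO interfaces.

/*
 * Userspace sketch of the keyword -> permission-bitmask scheme used by
 * tomoyo_write_file() and tomoyo_check_path_acl().  Abbreviated table,
 * illustrative names only.
 */
#include <stdio.h>
#include <string.h>

static const char * const path_keyword[] = {
	"execute", "read", "write", "append", "unlink", "getattr",
};
#define NR_OPS (sizeof(path_keyword) / sizeof(path_keyword[0]))

/*
 * Accumulate one bit per operation keyword found in the ACL line, as
 * tomoyo_write_file() does.  strstr() is a simplification of the exact
 * '/'-delimited matching done by tomoyo_permstr().
 */
static unsigned int parse_perm(const char *operation)
{
	unsigned int perm = 0;
	unsigned int type;

	for (type = 0; type < NR_OPS; type++)
		if (strstr(operation, path_keyword[type]))
			perm |= 1u << type;
	return perm;
}

int main(void)
{
	unsigned int perm = parse_perm("read/write/append");

	/* The check side mirrors tomoyo_check_path_acl(): a single bit test. */
	printf("write allowed: %s\n", (perm & (1u << 2)) ? "yes" : "no");
	printf("unlink allowed: %s\n", (perm & (1u << 4)) ? "yes" : "no");
	return 0;
}

Because each ACL entry is just a name pattern plus this bitmask, merging duplicate entries reduces to OR-ing (add) or AND-NOT-ing (delete) the permission words, which is exactly what tomoyo_merge_path_acl() and the other merge helpers above do.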
// SPDX-License-Identifier: GPL-2.0-only /* * kexec.c - kexec system call core code. * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/capability.h> #include <linux/mm.h> #include <linux/file.h> #include <linux/slab.h> #include <linux/fs.h> #include <linux/kexec.h> #include <linux/mutex.h> #include <linux/list.h> #include <linux/highmem.h> #include <linux/syscalls.h> #include <linux/reboot.h> #include <linux/ioport.h> #include <linux/hardirq.h> #include <linux/elf.h> #include <linux/elfcore.h> #include <linux/utsname.h> #include <linux/numa.h> #include <linux/suspend.h> #include <linux/device.h> #include <linux/freezer.h> #include <linux/panic_notifier.h> #include <linux/pm.h> #include <linux/cpu.h> #include <linux/uaccess.h> #include <linux/io.h> #include <linux/console.h> #include <linux/vmalloc.h> #include <linux/swap.h> #include <linux/syscore_ops.h> #include <linux/compiler.h> #include <linux/hugetlb.h> #include <linux/objtool.h> #include <linux/kmsg_dump.h> #include <asm/page.h> #include <asm/sections.h> #include <crypto/hash.h> #include "kexec_internal.h" atomic_t __kexec_lock = ATOMIC_INIT(0); /* Per cpu memory for storing cpu states in case of system crash. */ note_buf_t __percpu *crash_notes; /* Flag to indicate we are going to kexec a new kernel */ bool kexec_in_progress = false; /* Location of the reserved area for the crash kernel */ struct resource crashk_res = { .name = "Crash kernel", .start = 0, .end = 0, .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM, .desc = IORES_DESC_CRASH_KERNEL }; struct resource crashk_low_res = { .name = "Crash kernel", .start = 0, .end = 0, .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM, .desc = IORES_DESC_CRASH_KERNEL }; int kexec_should_crash(struct task_struct *p) { /* * If crash_kexec_post_notifiers is enabled, don't run * crash_kexec() here yet, which must be run after panic * notifiers in panic(). 
*/ if (crash_kexec_post_notifiers) return 0; /* * There are 4 panic() calls in do_exit() path, each of which * corresponds to each of these 4 conditions. */ if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops) return 1; return 0; } int kexec_crash_loaded(void) { return !!kexec_crash_image; } EXPORT_SYMBOL_GPL(kexec_crash_loaded); /* * When kexec transitions to the new kernel there is a one-to-one * mapping between physical and virtual addresses. On processors * where you can disable the MMU this is trivial, and easy. For * others it is still a simple predictable page table to setup. * * In that environment kexec copies the new kernel to its final * resting place. This means I can only support memory whose * physical address can fit in an unsigned long. In particular * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled. * If the assembly stub has more restrictive requirements * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be * defined more restrictively in <asm/kexec.h>. * * The code for the transition from the current kernel to the * new kernel is placed in the control_code_buffer, whose size * is given by KEXEC_CONTROL_PAGE_SIZE. In the best case only a single * page of memory is necessary, but some architectures require more. * Because this memory must be identity mapped in the transition from * virtual to physical addresses it must live in the range * 0 - TASK_SIZE, as only the user space mappings are arbitrarily * modifiable. * * The assembly stub in the control code buffer is passed a linked list * of descriptor pages detailing the source pages of the new kernel, * and the destination addresses of those source pages. As this data * structure is not used in the context of the current OS, it must * be self-contained. * * The code has been made to work with highmem pages and will use a * destination page in its final resting place (if it happens * to allocate it). The end product of this is that most of the * physical address space, and most of RAM can be used. * * Future directions include: * - allocating a page table with the control code buffer identity * mapped, to simplify machine_kexec and make kexec_on_panic more * reliable. */ /* * KIMAGE_NO_DEST is an impossible destination address..., for * allocating pages whose destination address we do not care about. */ #define KIMAGE_NO_DEST (-1UL) #define PAGE_COUNT(x) (((x) + PAGE_SIZE - 1) >> PAGE_SHIFT) static struct page *kimage_alloc_page(struct kimage *image, gfp_t gfp_mask, unsigned long dest); int sanity_check_segment_list(struct kimage *image) { int i; unsigned long nr_segments = image->nr_segments; unsigned long total_pages = 0; unsigned long nr_pages = totalram_pages(); /* * Verify we have good destination addresses. The caller is * responsible for making certain we don't attempt to load * the new image into invalid or reserved areas of RAM. This * just verifies it is an address we can use. * * Since the kernel does everything in page size chunks ensure * the destination addresses are page aligned. Too many * special cases crop of when we don't do this. The most * insidious is getting overlapping destination addresses * simply because addresses are changed to page size * granularity. 
*/ for (i = 0; i < nr_segments; i++) { unsigned long mstart, mend; mstart = image->segment[i].mem; mend = mstart + image->segment[i].memsz; if (mstart > mend) return -EADDRNOTAVAIL; if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK)) return -EADDRNOTAVAIL; if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT) return -EADDRNOTAVAIL; } /* Verify our destination addresses do not overlap. * If we alloed overlapping destination addresses * through very weird things can happen with no * easy explanation as one segment stops on another. */ for (i = 0; i < nr_segments; i++) { unsigned long mstart, mend; unsigned long j; mstart = image->segment[i].mem; mend = mstart + image->segment[i].memsz; for (j = 0; j < i; j++) { unsigned long pstart, pend; pstart = image->segment[j].mem; pend = pstart + image->segment[j].memsz; /* Do the segments overlap ? */ if ((mend > pstart) && (mstart < pend)) return -EINVAL; } } /* Ensure our buffer sizes are strictly less than * our memory sizes. This should always be the case, * and it is easier to check up front than to be surprised * later on. */ for (i = 0; i < nr_segments; i++) { if (image->segment[i].bufsz > image->segment[i].memsz) return -EINVAL; } /* * Verify that no more than half of memory will be consumed. If the * request from userspace is too large, a large amount of time will be * wasted allocating pages, which can cause a soft lockup. */ for (i = 0; i < nr_segments; i++) { if (PAGE_COUNT(image->segment[i].memsz) > nr_pages / 2) return -EINVAL; total_pages += PAGE_COUNT(image->segment[i].memsz); } if (total_pages > nr_pages / 2) return -EINVAL; /* * Verify we have good destination addresses. Normally * the caller is responsible for making certain we don't * attempt to load the new image into invalid or reserved * areas of RAM. But crash kernels are preloaded into a * reserved area of ram. We must ensure the addresses * are in the reserved area otherwise preloading the * kernel could corrupt things. 
*/ if (image->type == KEXEC_TYPE_CRASH) { for (i = 0; i < nr_segments; i++) { unsigned long mstart, mend; mstart = image->segment[i].mem; mend = mstart + image->segment[i].memsz - 1; /* Ensure we are within the crash kernel limits */ if ((mstart < phys_to_boot_phys(crashk_res.start)) || (mend > phys_to_boot_phys(crashk_res.end))) return -EADDRNOTAVAIL; } } return 0; } struct kimage *do_kimage_alloc_init(void) { struct kimage *image; /* Allocate a controlling structure */ image = kzalloc(sizeof(*image), GFP_KERNEL); if (!image) return NULL; image->head = 0; image->entry = &image->head; image->last_entry = &image->head; image->control_page = ~0; /* By default this does not apply */ image->type = KEXEC_TYPE_DEFAULT; /* Initialize the list of control pages */ INIT_LIST_HEAD(&image->control_pages); /* Initialize the list of destination pages */ INIT_LIST_HEAD(&image->dest_pages); /* Initialize the list of unusable pages */ INIT_LIST_HEAD(&image->unusable_pages); return image; } int kimage_is_destination_range(struct kimage *image, unsigned long start, unsigned long end) { unsigned long i; for (i = 0; i < image->nr_segments; i++) { unsigned long mstart, mend; mstart = image->segment[i].mem; mend = mstart + image->segment[i].memsz; if ((end > mstart) && (start < mend)) return 1; } return 0; } static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order) { struct page *pages; if (fatal_signal_pending(current)) return NULL; pages = alloc_pages(gfp_mask & ~__GFP_ZERO, order); if (pages) { unsigned int count, i; pages->mapping = NULL; set_page_private(pages, order); count = 1 << order; for (i = 0; i < count; i++) SetPageReserved(pages + i); arch_kexec_post_alloc_pages(page_address(pages), count, gfp_mask); if (gfp_mask & __GFP_ZERO) for (i = 0; i < count; i++) clear_highpage(pages + i); } return pages; } static void kimage_free_pages(struct page *page) { unsigned int order, count, i; order = page_private(page); count = 1 << order; arch_kexec_pre_free_pages(page_address(page), count); for (i = 0; i < count; i++) ClearPageReserved(page + i); __free_pages(page, order); } void kimage_free_page_list(struct list_head *list) { struct page *page, *next; list_for_each_entry_safe(page, next, list, lru) { list_del(&page->lru); kimage_free_pages(page); } } static struct page *kimage_alloc_normal_control_pages(struct kimage *image, unsigned int order) { /* Control pages are special, they are the intermediaries * that are needed while we copy the rest of the pages * to their final resting place. As such they must * not conflict with either the destination addresses * or memory the kernel is already using. * * The only case where we really need more than one of * these are for architectures where we cannot disable * the MMU and must instead generate an identity mapped * page table for all of the memory. * * At worst this runs in O(N) of the image size. */ struct list_head extra_pages; struct page *pages; unsigned int count; count = 1 << order; INIT_LIST_HEAD(&extra_pages); /* Loop while I can allocate a page and the page allocated * is a destination page. 
*/ do { unsigned long pfn, epfn, addr, eaddr; pages = kimage_alloc_pages(KEXEC_CONTROL_MEMORY_GFP, order); if (!pages) break; pfn = page_to_boot_pfn(pages); epfn = pfn + count; addr = pfn << PAGE_SHIFT; eaddr = epfn << PAGE_SHIFT; if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) || kimage_is_destination_range(image, addr, eaddr)) { list_add(&pages->lru, &extra_pages); pages = NULL; } } while (!pages); if (pages) { /* Remember the allocated page... */ list_add(&pages->lru, &image->control_pages); /* Because the page is already in it's destination * location we will never allocate another page at * that address. Therefore kimage_alloc_pages * will not return it (again) and we don't need * to give it an entry in image->segment[]. */ } /* Deal with the destination pages I have inadvertently allocated. * * Ideally I would convert multi-page allocations into single * page allocations, and add everything to image->dest_pages. * * For now it is simpler to just free the pages. */ kimage_free_page_list(&extra_pages); return pages; } static struct page *kimage_alloc_crash_control_pages(struct kimage *image, unsigned int order) { /* Control pages are special, they are the intermediaries * that are needed while we copy the rest of the pages * to their final resting place. As such they must * not conflict with either the destination addresses * or memory the kernel is already using. * * Control pages are also the only pags we must allocate * when loading a crash kernel. All of the other pages * are specified by the segments and we just memcpy * into them directly. * * The only case where we really need more than one of * these are for architectures where we cannot disable * the MMU and must instead generate an identity mapped * page table for all of the memory. * * Given the low demand this implements a very simple * allocator that finds the first hole of the appropriate * size in the reserved memory region, and allocates all * of the memory up to and including the hole. */ unsigned long hole_start, hole_end, size; struct page *pages; pages = NULL; size = (1 << order) << PAGE_SHIFT; hole_start = (image->control_page + (size - 1)) & ~(size - 1); hole_end = hole_start + size - 1; while (hole_end <= crashk_res.end) { unsigned long i; cond_resched(); if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT) break; /* See if I overlap any of the segments */ for (i = 0; i < image->nr_segments; i++) { unsigned long mstart, mend; mstart = image->segment[i].mem; mend = mstart + image->segment[i].memsz - 1; if ((hole_end >= mstart) && (hole_start <= mend)) { /* Advance the hole to the end of the segment */ hole_start = (mend + (size - 1)) & ~(size - 1); hole_end = hole_start + size - 1; break; } } /* If I don't overlap any segments I have found my hole! */ if (i == image->nr_segments) { pages = pfn_to_page(hole_start >> PAGE_SHIFT); image->control_page = hole_end; break; } } /* Ensure that these pages are decrypted if SME is enabled. 
*/ if (pages) arch_kexec_post_alloc_pages(page_address(pages), 1 << order, 0); return pages; } struct page *kimage_alloc_control_pages(struct kimage *image, unsigned int order) { struct page *pages = NULL; switch (image->type) { case KEXEC_TYPE_DEFAULT: pages = kimage_alloc_normal_control_pages(image, order); break; case KEXEC_TYPE_CRASH: pages = kimage_alloc_crash_control_pages(image, order); break; } return pages; } int kimage_crash_copy_vmcoreinfo(struct kimage *image) { struct page *vmcoreinfo_page; void *safecopy; if (image->type != KEXEC_TYPE_CRASH) return 0; /* * For kdump, allocate one vmcoreinfo safe copy from the * crash memory. as we have arch_kexec_protect_crashkres() * after kexec syscall, we naturally protect it from write * (even read) access under kernel direct mapping. But on * the other hand, we still need to operate it when crash * happens to generate vmcoreinfo note, hereby we rely on * vmap for this purpose. */ vmcoreinfo_page = kimage_alloc_control_pages(image, 0); if (!vmcoreinfo_page) { pr_warn("Could not allocate vmcoreinfo buffer\n"); return -ENOMEM; } safecopy = vmap(&vmcoreinfo_page, 1, VM_MAP, PAGE_KERNEL); if (!safecopy) { pr_warn("Could not vmap vmcoreinfo buffer\n"); return -ENOMEM; } image->vmcoreinfo_data_copy = safecopy; crash_update_vmcoreinfo_safecopy(safecopy); return 0; } static int kimage_add_entry(struct kimage *image, kimage_entry_t entry) { if (*image->entry != 0) image->entry++; if (image->entry == image->last_entry) { kimage_entry_t *ind_page; struct page *page; page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST); if (!page) return -ENOMEM; ind_page = page_address(page); *image->entry = virt_to_boot_phys(ind_page) | IND_INDIRECTION; image->entry = ind_page; image->last_entry = ind_page + ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1); } *image->entry = entry; image->entry++; *image->entry = 0; return 0; } static int kimage_set_destination(struct kimage *image, unsigned long destination) { int result; destination &= PAGE_MASK; result = kimage_add_entry(image, destination | IND_DESTINATION); return result; } static int kimage_add_page(struct kimage *image, unsigned long page) { int result; page &= PAGE_MASK; result = kimage_add_entry(image, page | IND_SOURCE); return result; } static void kimage_free_extra_pages(struct kimage *image) { /* Walk through and free any extra destination pages I may have */ kimage_free_page_list(&image->dest_pages); /* Walk through and free any unusable pages I have cached */ kimage_free_page_list(&image->unusable_pages); } int __weak machine_kexec_post_load(struct kimage *image) { return 0; } void kimage_terminate(struct kimage *image) { if (*image->entry != 0) image->entry++; *image->entry = IND_DONE; } #define for_each_kimage_entry(image, ptr, entry) \ for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \ ptr = (entry & IND_INDIRECTION) ? 
\ boot_phys_to_virt((entry & PAGE_MASK)) : ptr + 1) static void kimage_free_entry(kimage_entry_t entry) { struct page *page; page = boot_pfn_to_page(entry >> PAGE_SHIFT); kimage_free_pages(page); } void kimage_free(struct kimage *image) { kimage_entry_t *ptr, entry; kimage_entry_t ind = 0; if (!image) return; if (image->vmcoreinfo_data_copy) { crash_update_vmcoreinfo_safecopy(NULL); vunmap(image->vmcoreinfo_data_copy); } kimage_free_extra_pages(image); for_each_kimage_entry(image, ptr, entry) { if (entry & IND_INDIRECTION) { /* Free the previous indirection page */ if (ind & IND_INDIRECTION) kimage_free_entry(ind); /* Save this indirection page until we are * done with it. */ ind = entry; } else if (entry & IND_SOURCE) kimage_free_entry(entry); } /* Free the final indirection page */ if (ind & IND_INDIRECTION) kimage_free_entry(ind); /* Handle any machine specific cleanup */ machine_kexec_cleanup(image); /* Free the kexec control pages... */ kimage_free_page_list(&image->control_pages); /* * Free up any temporary buffers allocated. This might hit if * error occurred much later after buffer allocation. */ if (image->file_mode) kimage_file_post_load_cleanup(image); kfree(image); } static kimage_entry_t *kimage_dst_used(struct kimage *image, unsigned long page) { kimage_entry_t *ptr, entry; unsigned long destination = 0; for_each_kimage_entry(image, ptr, entry) { if (entry & IND_DESTINATION) destination = entry & PAGE_MASK; else if (entry & IND_SOURCE) { if (page == destination) return ptr; destination += PAGE_SIZE; } } return NULL; } static struct page *kimage_alloc_page(struct kimage *image, gfp_t gfp_mask, unsigned long destination) { /* * Here we implement safeguards to ensure that a source page * is not copied to its destination page before the data on * the destination page is no longer useful. * * To do this we maintain the invariant that a source page is * either its own destination page, or it is not a * destination page at all. * * That is slightly stronger than required, but the proof * that no problems will not occur is trivial, and the * implementation is simply to verify. * * When allocating all pages normally this algorithm will run * in O(N) time, but in the worst case it will run in O(N^2) * time. If the runtime is a problem the data structures can * be fixed. */ struct page *page; unsigned long addr; /* * Walk through the list of destination pages, and see if I * have a match. */ list_for_each_entry(page, &image->dest_pages, lru) { addr = page_to_boot_pfn(page) << PAGE_SHIFT; if (addr == destination) { list_del(&page->lru); return page; } } page = NULL; while (1) { kimage_entry_t *old; /* Allocate a page, if we run out of memory give up */ page = kimage_alloc_pages(gfp_mask, 0); if (!page) return NULL; /* If the page cannot be used file it away */ if (page_to_boot_pfn(page) > (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) { list_add(&page->lru, &image->unusable_pages); continue; } addr = page_to_boot_pfn(page) << PAGE_SHIFT; /* If it is the destination page we want use it */ if (addr == destination) break; /* If the page is not a destination page use it */ if (!kimage_is_destination_range(image, addr, addr + PAGE_SIZE)) break; /* * I know that the page is someones destination page. * See if there is already a source page for this * destination page. And if so swap the source pages. 
*/ old = kimage_dst_used(image, addr); if (old) { /* If so move it */ unsigned long old_addr; struct page *old_page; old_addr = *old & PAGE_MASK; old_page = boot_pfn_to_page(old_addr >> PAGE_SHIFT); copy_highpage(page, old_page); *old = addr | (*old & ~PAGE_MASK); /* The old page I have found cannot be a * destination page, so return it if it's * gfp_flags honor the ones passed in. */ if (!(gfp_mask & __GFP_HIGHMEM) && PageHighMem(old_page)) { kimage_free_pages(old_page); continue; } addr = old_addr; page = old_page; break; } /* Place the page on the destination list, to be used later */ list_add(&page->lru, &image->dest_pages); } return page; } static int kimage_load_normal_segment(struct kimage *image, struct kexec_segment *segment) { unsigned long maddr; size_t ubytes, mbytes; int result; unsigned char __user *buf = NULL; unsigned char *kbuf = NULL; result = 0; if (image->file_mode) kbuf = segment->kbuf; else buf = segment->buf; ubytes = segment->bufsz; mbytes = segment->memsz; maddr = segment->mem; result = kimage_set_destination(image, maddr); if (result < 0) goto out; while (mbytes) { struct page *page; char *ptr; size_t uchunk, mchunk; page = kimage_alloc_page(image, GFP_HIGHUSER, maddr); if (!page) { result = -ENOMEM; goto out; } result = kimage_add_page(image, page_to_boot_pfn(page) << PAGE_SHIFT); if (result < 0) goto out; ptr = kmap(page); /* Start with a clear page */ clear_page(ptr); ptr += maddr & ~PAGE_MASK; mchunk = min_t(size_t, mbytes, PAGE_SIZE - (maddr & ~PAGE_MASK)); uchunk = min(ubytes, mchunk); /* For file based kexec, source pages are in kernel memory */ if (image->file_mode) memcpy(ptr, kbuf, uchunk); else result = copy_from_user(ptr, buf, uchunk); kunmap(page); if (result) { result = -EFAULT; goto out; } ubytes -= uchunk; maddr += mchunk; if (image->file_mode) kbuf += mchunk; else buf += mchunk; mbytes -= mchunk; cond_resched(); } out: return result; } static int kimage_load_crash_segment(struct kimage *image, struct kexec_segment *segment) { /* For crash dumps kernels we simply copy the data from * user space to it's destination. * We do things a page at a time for the sake of kmap. 
*/ unsigned long maddr; size_t ubytes, mbytes; int result; unsigned char __user *buf = NULL; unsigned char *kbuf = NULL; result = 0; if (image->file_mode) kbuf = segment->kbuf; else buf = segment->buf; ubytes = segment->bufsz; mbytes = segment->memsz; maddr = segment->mem; while (mbytes) { struct page *page; char *ptr; size_t uchunk, mchunk; page = boot_pfn_to_page(maddr >> PAGE_SHIFT); if (!page) { result = -ENOMEM; goto out; } arch_kexec_post_alloc_pages(page_address(page), 1, 0); ptr = kmap(page); ptr += maddr & ~PAGE_MASK; mchunk = min_t(size_t, mbytes, PAGE_SIZE - (maddr & ~PAGE_MASK)); uchunk = min(ubytes, mchunk); if (mchunk > uchunk) { /* Zero the trailing part of the page */ memset(ptr + uchunk, 0, mchunk - uchunk); } /* For file based kexec, source pages are in kernel memory */ if (image->file_mode) memcpy(ptr, kbuf, uchunk); else result = copy_from_user(ptr, buf, uchunk); kexec_flush_icache_page(page); kunmap(page); arch_kexec_pre_free_pages(page_address(page), 1); if (result) { result = -EFAULT; goto out; } ubytes -= uchunk; maddr += mchunk; if (image->file_mode) kbuf += mchunk; else buf += mchunk; mbytes -= mchunk; cond_resched(); } out: return result; } int kimage_load_segment(struct kimage *image, struct kexec_segment *segment) { int result = -ENOMEM; switch (image->type) { case KEXEC_TYPE_DEFAULT: result = kimage_load_normal_segment(image, segment); break; case KEXEC_TYPE_CRASH: result = kimage_load_crash_segment(image, segment); break; } return result; } struct kimage *kexec_image; struct kimage *kexec_crash_image; int kexec_load_disabled; /* * No panic_cpu check version of crash_kexec(). This function is called * only when panic_cpu holds the current CPU number; this is the only CPU * which processes crash_kexec routines. */ void __noclone __crash_kexec(struct pt_regs *regs) { /* Take the kexec_lock here to prevent sys_kexec_load * running on one cpu from replacing the crash kernel * we are using after a panic on a different cpu. * * If the crash kernel was not located in a fixed area * of memory the xchg(&kexec_crash_image) would be * sufficient. But since I reuse the memory... */ if (kexec_trylock()) { if (kexec_crash_image) { struct pt_regs fixed_regs; crash_setup_regs(&fixed_regs, regs); crash_save_vmcoreinfo(); machine_crash_shutdown(&fixed_regs); machine_kexec(kexec_crash_image); } kexec_unlock(); } } STACK_FRAME_NON_STANDARD(__crash_kexec); void crash_kexec(struct pt_regs *regs) { int old_cpu, this_cpu; /* * Only one CPU is allowed to execute the crash_kexec() code as with * panic(). Otherwise parallel calls of panic() and crash_kexec() * may stop each other. To exclude them, we use panic_cpu here too. */ this_cpu = raw_smp_processor_id(); old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, this_cpu); if (old_cpu == PANIC_CPU_INVALID) { /* This is the 1st CPU which comes here, so go ahead. */ __crash_kexec(regs); /* * Reset panic_cpu to allow another panic()/crash_kexec() * call. 
*/ atomic_set(&panic_cpu, PANIC_CPU_INVALID); } } ssize_t crash_get_memory_size(void) { ssize_t size = 0; if (!kexec_trylock()) return -EBUSY; if (crashk_res.end != crashk_res.start) size = resource_size(&crashk_res); kexec_unlock(); return size; } void __weak crash_free_reserved_phys_range(unsigned long begin, unsigned long end) { unsigned long addr; for (addr = begin; addr < end; addr += PAGE_SIZE) free_reserved_page(boot_pfn_to_page(addr >> PAGE_SHIFT)); } int crash_shrink_memory(unsigned long new_size) { int ret = 0; unsigned long start, end; unsigned long old_size; struct resource *ram_res; if (!kexec_trylock()) return -EBUSY; if (kexec_crash_image) { ret = -ENOENT; goto unlock; } start = crashk_res.start; end = crashk_res.end; old_size = (end == 0) ? 0 : end - start + 1; new_size = roundup(new_size, KEXEC_CRASH_MEM_ALIGN); if (new_size >= old_size) { ret = (new_size == old_size) ? 0 : -EINVAL; goto unlock; } ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL); if (!ram_res) { ret = -ENOMEM; goto unlock; } end = start + new_size; crash_free_reserved_phys_range(end, crashk_res.end); if ((start == end) && (crashk_res.parent != NULL)) release_resource(&crashk_res); ram_res->start = end; ram_res->end = crashk_res.end; ram_res->flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM; ram_res->name = "System RAM"; crashk_res.end = end - 1; insert_resource(&iomem_resource, ram_res); unlock: kexec_unlock(); return ret; } void crash_save_cpu(struct pt_regs *regs, int cpu) { struct elf_prstatus prstatus; u32 *buf; if ((cpu < 0) || (cpu >= nr_cpu_ids)) return; /* Using ELF notes here is opportunistic. * I need a well defined structure format * for the data I pass, and I need tags * on the data to indicate what information I have * squirrelled away. ELF notes happen to provide * all of that, so there is no need to invent something new. */ buf = (u32 *)per_cpu_ptr(crash_notes, cpu); if (!buf) return; memset(&prstatus, 0, sizeof(prstatus)); prstatus.common.pr_pid = current->pid; elf_core_copy_kernel_regs(&prstatus.pr_reg, regs); buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS, &prstatus, sizeof(prstatus)); final_note(buf); } static int __init crash_notes_memory_init(void) { /* Allocate memory for saving cpu registers. */ size_t size, align; /* * crash_notes could be allocated across 2 vmalloc pages when percpu * is vmalloc based . vmalloc doesn't guarantee 2 continuous vmalloc * pages are also on 2 continuous physical pages. In this case the * 2nd part of crash_notes in 2nd page could be lost since only the * starting address and size of crash_notes are exported through sysfs. * Here round up the size of crash_notes to the nearest power of two * and pass it to __alloc_percpu as align value. This can make sure * crash_notes is allocated inside one physical page. */ size = sizeof(note_buf_t); align = min(roundup_pow_of_two(sizeof(note_buf_t)), PAGE_SIZE); /* * Break compile if size is bigger than PAGE_SIZE since crash_notes * definitely will be in 2 pages with that. */ BUILD_BUG_ON(size > PAGE_SIZE); crash_notes = __alloc_percpu(size, align); if (!crash_notes) { pr_warn("Memory allocation for saving cpu register states failed\n"); return -ENOMEM; } return 0; } subsys_initcall(crash_notes_memory_init); /* * Move into place and start executing a preloaded standalone * executable. If nothing was preloaded return an error. 
*/ int kernel_kexec(void) { int error = 0; if (!kexec_trylock()) return -EBUSY; if (!kexec_image) { error = -EINVAL; goto Unlock; } #ifdef CONFIG_KEXEC_JUMP if (kexec_image->preserve_context) { pm_prepare_console(); error = freeze_processes(); if (error) { error = -EBUSY; goto Restore_console; } suspend_console(); error = dpm_suspend_start(PMSG_FREEZE); if (error) goto Resume_console; /* At this point, dpm_suspend_start() has been called, * but *not* dpm_suspend_end(). We *must* call * dpm_suspend_end() now. Otherwise, drivers for * some devices (e.g. interrupt controllers) become * desynchronized with the actual state of the * hardware at resume time, and evil weirdness ensues. */ error = dpm_suspend_end(PMSG_FREEZE); if (error) goto Resume_devices; error = suspend_disable_secondary_cpus(); if (error) goto Enable_cpus; local_irq_disable(); error = syscore_suspend(); if (error) goto Enable_irqs; } else #endif { kexec_in_progress = true; kernel_restart_prepare("kexec reboot"); migrate_to_reboot_cpu(); /* * migrate_to_reboot_cpu() disables CPU hotplug assuming that * no further code needs to use CPU hotplug (which is true in * the reboot case). However, the kexec path depends on using * CPU hotplug again; so re-enable it here. */ cpu_hotplug_enable(); pr_notice("Starting new kernel\n"); machine_shutdown(); } kmsg_dump(KMSG_DUMP_SHUTDOWN); machine_kexec(kexec_image); #ifdef CONFIG_KEXEC_JUMP if (kexec_image->preserve_context) { syscore_resume(); Enable_irqs: local_irq_enable(); Enable_cpus: suspend_enable_secondary_cpus(); dpm_resume_start(PMSG_RESTORE); Resume_devices: dpm_resume_end(PMSG_RESTORE); Resume_console: resume_console(); thaw_processes(); Restore_console: pm_restore_console(); } #endif Unlock: kexec_unlock(); return error; } /* * Protection mechanism for crashkernel reserved memory after * the kdump kernel is loaded. * * Provide an empty default implementation here -- architecture * code may override this */ void __weak arch_kexec_protect_crashkres(void) {} void __weak arch_kexec_unprotect_crashkres(void) {}
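/*
 * Illustrative userspace sketch (not kernel code): a minimal model of the
 * flat entry list that kimage_add_entry() builds and for_each_kimage_entry()
 * walks above.  An entry is a page-aligned physical address with a type flag
 * in the low bits.  The flag names mirror the kernel's IND_* flags, but the
 * EX_* values, helper names and addresses below are assumptions chosen only
 * for this example.
 */
#include <stdio.h>

#define EX_PAGE_SHIFT      12
#define EX_PAGE_SIZE       (1UL << EX_PAGE_SHIFT)
#define EX_PAGE_MASK       (~(EX_PAGE_SIZE - 1))

#define EX_IND_DESTINATION 0x1UL   /* following source pages land here      */
#define EX_IND_INDIRECTION 0x2UL   /* entry points at the next entry page   */
#define EX_IND_DONE        0x4UL   /* end of the list                       */
#define EX_IND_SOURCE      0x8UL   /* a source page to copy to the cursor   */

/* Walk a single (non-indirected) entry array the way the copy loop would. */
static void ex_walk(const unsigned long *entry)
{
	unsigned long dest = 0;

	for (; !(*entry & EX_IND_DONE); entry++) {
		unsigned long addr = *entry & EX_PAGE_MASK;

		if (*entry & EX_IND_DESTINATION) {
			dest = addr;              /* reset the copy cursor */
		} else if (*entry & EX_IND_SOURCE) {
			printf("copy page 0x%lx -> 0x%lx\n", addr, dest);
			dest += EX_PAGE_SIZE;     /* cursor advances one page per source */
		}
		/* chaining via EX_IND_INDIRECTION to another entry page is omitted */
	}
}

int main(void)
{
	/* One destination run followed by two source pages, then DONE. */
	unsigned long list[] = {
		0x100000UL | EX_IND_DESTINATION,
		0x7f0000UL | EX_IND_SOURCE,
		0x7f1000UL | EX_IND_SOURCE,
		EX_IND_DONE,
	};

	ex_walk(list);
	return 0;
}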
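/*
 * Illustrative userspace sketch (not kernel code): the two cheap structural
 * checks that sanity_check_segment_list() applies to every segment above --
 * page alignment of [mem, mem + memsz) and pairwise non-overlap.  The types
 * and helper names here are invented for the example.
 */
#include <stdbool.h>
#include <stddef.h>

#define EX_PAGE_SIZE 4096UL
#define EX_PAGE_MASK (~(EX_PAGE_SIZE - 1))

struct ex_segment {
	unsigned long mem;    /* destination physical address */
	unsigned long memsz;  /* destination size in bytes    */
};

static bool ex_segment_aligned(const struct ex_segment *s)
{
	unsigned long mstart = s->mem;
	unsigned long mend = mstart + s->memsz;

	/* Both ends must sit on a page boundary, and the range must not wrap. */
	return mstart <= mend &&
	       !(mstart & ~EX_PAGE_MASK) && !(mend & ~EX_PAGE_MASK);
}

static bool ex_segments_overlap(const struct ex_segment *a,
				const struct ex_segment *b)
{
	/* Half-open ranges [mem, mem + memsz) overlap iff each one starts
	 * before the other one ends -- the same test as the kernel loop. */
	return (a->mem + a->memsz > b->mem) && (a->mem < b->mem + b->memsz);
}

static bool ex_check_segments(const struct ex_segment *seg, size_t n)
{
	for (size_t i = 0; i < n; i++) {
		if (!ex_segment_aligned(&seg[i]))
			return false;
		for (size_t j = 0; j < i; j++)
			if (ex_segments_overlap(&seg[i], &seg[j]))
				return false;
	}
	return true;
}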
/* * originally based on the dummy device. * * Copyright 1999, Thomas Davis, tadavis@lbl.gov. * Licensed under the GPL. Based on dummy.c, and eql.c devices. * * bonding.c: an Ethernet Bonding driver * * This is useful to talk to a Cisco EtherChannel compatible equipment: * Cisco 5500 * Sun Trunking (Solaris) * Alteon AceDirector Trunks * Linux Bonding * and probably many L2 switches ... * * How it works: * ifconfig bond0 ipaddress netmask up * will setup a network device, with an ip address. No mac address * will be assigned at this time. The hw mac address will come from * the first slave bonded to the channel. All slaves will then use * this hw mac address. * * ifconfig bond0 down * will release all slaves, marking them as down. * * ifenslave bond0 eth0 * will attach eth0 to bond0 as a slave. eth0 hw mac address will either * a: be used as initial mac address * b: if a hw mac address already is there, eth0's hw mac address * will then be set from bond0.
* */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/types.h> #include <linux/fcntl.h> #include <linux/interrupt.h> #include <linux/ptrace.h> #include <linux/ioport.h> #include <linux/in.h> #include <net/ip.h> #include <linux/ip.h> #include <linux/icmp.h> #include <linux/icmpv6.h> #include <linux/tcp.h> #include <linux/udp.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/init.h> #include <linux/timer.h> #include <linux/socket.h> #include <linux/ctype.h> #include <linux/inet.h> #include <linux/bitops.h> #include <linux/io.h> #include <asm/dma.h> #include <linux/uaccess.h> #include <linux/errno.h> #include <linux/netdevice.h> #include <linux/inetdevice.h> #include <linux/igmp.h> #include <linux/etherdevice.h> #include <linux/skbuff.h> #include <net/sock.h> #include <linux/rtnetlink.h> #include <linux/smp.h> #include <linux/if_ether.h> #include <net/arp.h> #include <linux/mii.h> #include <linux/ethtool.h> #include <linux/if_vlan.h> #include <linux/if_bonding.h> #include <linux/jiffies.h> #include <linux/preempt.h> #include <net/route.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/pkt_sched.h> #include <linux/rculist.h> #include <net/flow_dissector.h> #include <net/xfrm.h> #include <net/bonding.h> #include <net/bond_3ad.h> #include <net/bond_alb.h> #if IS_ENABLED(CONFIG_TLS_DEVICE) #include <net/tls.h> #endif #include "bonding_priv.h" /*---------------------------- Module parameters ----------------------------*/ /* monitor all links that often (in milliseconds). <=0 disables monitoring */ static int max_bonds = BOND_DEFAULT_MAX_BONDS; static int tx_queues = BOND_DEFAULT_TX_QUEUES; static int num_peer_notif = 1; static int miimon; static int updelay; static int downdelay; static int use_carrier = 1; static char *mode; static char *primary; static char *primary_reselect; static char *lacp_rate; static int min_links; static char *ad_select; static char *xmit_hash_policy; static int arp_interval; static char *arp_ip_target[BOND_MAX_ARP_TARGETS]; static char *arp_validate; static char *arp_all_targets; static char *fail_over_mac; static int all_slaves_active; static struct bond_params bonding_defaults; static int resend_igmp = BOND_DEFAULT_RESEND_IGMP; static int packets_per_slave = 1; static int lp_interval = BOND_ALB_DEFAULT_LP_INTERVAL; module_param(max_bonds, int, 0); MODULE_PARM_DESC(max_bonds, "Max number of bonded devices"); module_param(tx_queues, int, 0); MODULE_PARM_DESC(tx_queues, "Max number of transmit queues (default = 16)"); module_param_named(num_grat_arp, num_peer_notif, int, 0644); MODULE_PARM_DESC(num_grat_arp, "Number of peer notifications to send on " "failover event (alias of num_unsol_na)"); module_param_named(num_unsol_na, num_peer_notif, int, 0644); MODULE_PARM_DESC(num_unsol_na, "Number of peer notifications to send on " "failover event (alias of num_grat_arp)"); module_param(miimon, int, 0); MODULE_PARM_DESC(miimon, "Link check interval in milliseconds"); module_param(updelay, int, 0); MODULE_PARM_DESC(updelay, "Delay before considering link up, in milliseconds"); module_param(downdelay, int, 0); MODULE_PARM_DESC(downdelay, "Delay before considering link down, " "in milliseconds"); module_param(use_carrier, int, 0); MODULE_PARM_DESC(use_carrier, "Use netif_carrier_ok (vs MII ioctls) in miimon; " "0 for off, 1 for on (default)"); module_param(mode, charp, 0); MODULE_PARM_DESC(mode, "Mode of operation; 0 for balance-rr, " "1 for active-backup, 2 for balance-xor, " "3 for broadcast, 4 for 802.3ad, 5 for 
balance-tlb, " "6 for balance-alb"); module_param(primary, charp, 0); MODULE_PARM_DESC(primary, "Primary network device to use"); module_param(primary_reselect, charp, 0); MODULE_PARM_DESC(primary_reselect, "Reselect primary slave " "once it comes up; " "0 for always (default), " "1 for only if speed of primary is " "better, " "2 for only on active slave " "failure"); module_param(lacp_rate, charp, 0); MODULE_PARM_DESC(lacp_rate, "LACPDU tx rate to request from 802.3ad partner; " "0 for slow, 1 for fast"); module_param(ad_select, charp, 0); MODULE_PARM_DESC(ad_select, "802.3ad aggregation selection logic; " "0 for stable (default), 1 for bandwidth, " "2 for count"); module_param(min_links, int, 0); MODULE_PARM_DESC(min_links, "Minimum number of available links before turning on carrier"); module_param(xmit_hash_policy, charp, 0); MODULE_PARM_DESC(xmit_hash_policy, "balance-alb, balance-tlb, balance-xor, 802.3ad hashing method; " "0 for layer 2 (default), 1 for layer 3+4, " "2 for layer 2+3, 3 for encap layer 2+3, " "4 for encap layer 3+4, 5 for vlan+srcmac"); module_param(arp_interval, int, 0); MODULE_PARM_DESC(arp_interval, "arp interval in milliseconds"); module_param_array(arp_ip_target, charp, NULL, 0); MODULE_PARM_DESC(arp_ip_target, "arp targets in n.n.n.n form"); module_param(arp_validate, charp, 0); MODULE_PARM_DESC(arp_validate, "validate src/dst of ARP probes; " "0 for none (default), 1 for active, " "2 for backup, 3 for all"); module_param(arp_all_targets, charp, 0); MODULE_PARM_DESC(arp_all_targets, "fail on any/all arp targets timeout; 0 for any (default), 1 for all"); module_param(fail_over_mac, charp, 0); MODULE_PARM_DESC(fail_over_mac, "For active-backup, do not set all slaves to " "the same MAC; 0 for none (default), " "1 for active, 2 for follow"); module_param(all_slaves_active, int, 0); MODULE_PARM_DESC(all_slaves_active, "Keep all frames received on an interface " "by setting active flag for all slaves; " "0 for never (default), 1 for always."); module_param(resend_igmp, int, 0); MODULE_PARM_DESC(resend_igmp, "Number of IGMP membership reports to send on " "link failure"); module_param(packets_per_slave, int, 0); MODULE_PARM_DESC(packets_per_slave, "Packets to send per slave in balance-rr " "mode; 0 for a random slave, 1 packet per " "slave (default), >1 packets per slave."); module_param(lp_interval, uint, 0); MODULE_PARM_DESC(lp_interval, "The number of seconds between instances where " "the bonding driver sends learning packets to " "each slaves peer switch. 
The default is 1."); /*----------------------------- Global variables ----------------------------*/ #ifdef CONFIG_NET_POLL_CONTROLLER atomic_t netpoll_block_tx = ATOMIC_INIT(0); #endif unsigned int bond_net_id __read_mostly; static const struct flow_dissector_key flow_keys_bonding_keys[] = { { .key_id = FLOW_DISSECTOR_KEY_CONTROL, .offset = offsetof(struct flow_keys, control), }, { .key_id = FLOW_DISSECTOR_KEY_BASIC, .offset = offsetof(struct flow_keys, basic), }, { .key_id = FLOW_DISSECTOR_KEY_IPV4_ADDRS, .offset = offsetof(struct flow_keys, addrs.v4addrs), }, { .key_id = FLOW_DISSECTOR_KEY_IPV6_ADDRS, .offset = offsetof(struct flow_keys, addrs.v6addrs), }, { .key_id = FLOW_DISSECTOR_KEY_TIPC, .offset = offsetof(struct flow_keys, addrs.tipckey), }, { .key_id = FLOW_DISSECTOR_KEY_PORTS, .offset = offsetof(struct flow_keys, ports), }, { .key_id = FLOW_DISSECTOR_KEY_ICMP, .offset = offsetof(struct flow_keys, icmp), }, { .key_id = FLOW_DISSECTOR_KEY_VLAN, .offset = offsetof(struct flow_keys, vlan), }, { .key_id = FLOW_DISSECTOR_KEY_FLOW_LABEL, .offset = offsetof(struct flow_keys, tags), }, { .key_id = FLOW_DISSECTOR_KEY_GRE_KEYID, .offset = offsetof(struct flow_keys, keyid), }, }; static struct flow_dissector flow_keys_bonding __read_mostly; /*-------------------------- Forward declarations ---------------------------*/ static int bond_init(struct net_device *bond_dev); static void bond_uninit(struct net_device *bond_dev); static void bond_get_stats(struct net_device *bond_dev, struct rtnl_link_stats64 *stats); static void bond_slave_arr_handler(struct work_struct *work); static bool bond_time_in_interval(struct bonding *bond, unsigned long last_act, int mod); static void bond_netdev_notify_work(struct work_struct *work); /*---------------------------- General routines -----------------------------*/ const char *bond_mode_name(int mode) { static const char *names[] = { [BOND_MODE_ROUNDROBIN] = "load balancing (round-robin)", [BOND_MODE_ACTIVEBACKUP] = "fault-tolerance (active-backup)", [BOND_MODE_XOR] = "load balancing (xor)", [BOND_MODE_BROADCAST] = "fault-tolerance (broadcast)", [BOND_MODE_8023AD] = "IEEE 802.3ad Dynamic link aggregation", [BOND_MODE_TLB] = "transmit load balancing", [BOND_MODE_ALB] = "adaptive load balancing", }; if (mode < BOND_MODE_ROUNDROBIN || mode > BOND_MODE_ALB) return "unknown"; return names[mode]; } /** * bond_dev_queue_xmit - Prepare skb for xmit. * * @bond: bond device that got this skb for tx. 
* @skb: hw accel VLAN tagged skb to transmit * @slave_dev: slave that is supposed to xmit this skbuff */ netdev_tx_t bond_dev_queue_xmit(struct bonding *bond, struct sk_buff *skb, struct net_device *slave_dev) { skb->dev = slave_dev; BUILD_BUG_ON(sizeof(skb->queue_mapping) != sizeof(qdisc_skb_cb(skb)->slave_dev_queue_mapping)); skb_set_queue_mapping(skb, qdisc_skb_cb(skb)->slave_dev_queue_mapping); if (unlikely(netpoll_tx_running(bond->dev))) return bond_netpoll_send_skb(bond_get_slave_by_dev(bond, slave_dev), skb); return dev_queue_xmit(skb); } bool bond_sk_check(struct bonding *bond) { switch (BOND_MODE(bond)) { case BOND_MODE_8023AD: case BOND_MODE_XOR: if (bond->params.xmit_policy == BOND_XMIT_POLICY_LAYER34) return true; fallthrough; default: return false; } } static bool bond_xdp_check(struct bonding *bond) { switch (BOND_MODE(bond)) { case BOND_MODE_ROUNDROBIN: case BOND_MODE_ACTIVEBACKUP: return true; case BOND_MODE_8023AD: case BOND_MODE_XOR: /* vlan+srcmac is not supported with XDP as in most cases the 802.1q * payload is not in the packet due to hardware offload. */ if (bond->params.xmit_policy != BOND_XMIT_POLICY_VLAN_SRCMAC) return true; fallthrough; default: return false; } } /*---------------------------------- VLAN -----------------------------------*/ /* In the following 2 functions, bond_vlan_rx_add_vid and bond_vlan_rx_kill_vid, * We don't protect the slave list iteration with a lock because: * a. This operation is performed in IOCTL context, * b. The operation is protected by the RTNL semaphore in the 8021q code, * c. Holding a lock with BH disabled while directly calling a base driver * entry point is generally a BAD idea. * * The design of synchronization/protection for this operation in the 8021q * module is good for one or more VLAN devices over a single physical device * and cannot be extended for a teaming solution like bonding, so there is a * potential race condition here where a net device from the vlan group might * be referenced (either by a base driver or the 8021q code) while it is being * removed from the system. However, it turns out we're not making matters * worse, and if it works for regular VLAN usage it will work here too. 
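 *
 * Note also that the add path below unwinds on partial failure: if
 * vlan_vid_add() fails for one slave, the VID is deleted again from every
 * slave that had already accepted it and the error is returned to the
 * 8021q caller.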
*/ /** * bond_vlan_rx_add_vid - Propagates adding an id to slaves * @bond_dev: bonding net device that got called * @proto: network protocol ID * @vid: vlan id being added */ static int bond_vlan_rx_add_vid(struct net_device *bond_dev, __be16 proto, u16 vid) { struct bonding *bond = netdev_priv(bond_dev); struct slave *slave, *rollback_slave; struct list_head *iter; int res; bond_for_each_slave(bond, slave, iter) { res = vlan_vid_add(slave->dev, proto, vid); if (res) goto unwind; } return 0; unwind: /* unwind to the slave that failed */ bond_for_each_slave(bond, rollback_slave, iter) { if (rollback_slave == slave) break; vlan_vid_del(rollback_slave->dev, proto, vid); } return res; } /** * bond_vlan_rx_kill_vid - Propagates deleting an id to slaves * @bond_dev: bonding net device that got called * @proto: network protocol ID * @vid: vlan id being removed */ static int bond_vlan_rx_kill_vid(struct net_device *bond_dev, __be16 proto, u16 vid) { struct bonding *bond = netdev_priv(bond_dev); struct list_head *iter; struct slave *slave; bond_for_each_slave(bond, slave, iter) vlan_vid_del(slave->dev, proto, vid); if (bond_is_lb(bond)) bond_alb_clear_vlan(bond, vid); return 0; } /*---------------------------------- XFRM -----------------------------------*/ #ifdef CONFIG_XFRM_OFFLOAD /** * bond_ipsec_add_sa - program device with a security association * @xs: pointer to transformer state struct **/ static int bond_ipsec_add_sa(struct xfrm_state *xs) { struct net_device *bond_dev = xs->xso.dev; struct bond_ipsec *ipsec; struct bonding *bond; struct slave *slave; int err; if (!bond_dev) return -EINVAL; rcu_read_lock(); bond = netdev_priv(bond_dev); slave = rcu_dereference(bond->curr_active_slave); if (!slave) { rcu_read_unlock(); return -ENODEV; } if (!slave->dev->xfrmdev_ops || !slave->dev->xfrmdev_ops->xdo_dev_state_add || netif_is_bond_master(slave->dev)) { slave_warn(bond_dev, slave->dev, "Slave does not support ipsec offload\n"); rcu_read_unlock(); return -EINVAL; } ipsec = kmalloc(sizeof(*ipsec), GFP_ATOMIC); if (!ipsec) { rcu_read_unlock(); return -ENOMEM; } xs->xso.real_dev = slave->dev; err = slave->dev->xfrmdev_ops->xdo_dev_state_add(xs); if (!err) { ipsec->xs = xs; INIT_LIST_HEAD(&ipsec->list); spin_lock_bh(&bond->ipsec_lock); list_add(&ipsec->list, &bond->ipsec_list); spin_unlock_bh(&bond->ipsec_lock); } else { kfree(ipsec); } rcu_read_unlock(); return err; } static void bond_ipsec_add_sa_all(struct bonding *bond) { struct net_device *bond_dev = bond->dev; struct bond_ipsec *ipsec; struct slave *slave; rcu_read_lock(); slave = rcu_dereference(bond->curr_active_slave); if (!slave) goto out; if (!slave->dev->xfrmdev_ops || !slave->dev->xfrmdev_ops->xdo_dev_state_add || netif_is_bond_master(slave->dev)) { spin_lock_bh(&bond->ipsec_lock); if (!list_empty(&bond->ipsec_list)) slave_warn(bond_dev, slave->dev, "%s: no slave xdo_dev_state_add\n", __func__); spin_unlock_bh(&bond->ipsec_lock); goto out; } spin_lock_bh(&bond->ipsec_lock); list_for_each_entry(ipsec, &bond->ipsec_list, list) { ipsec->xs->xso.real_dev = slave->dev; if (slave->dev->xfrmdev_ops->xdo_dev_state_add(ipsec->xs)) { slave_warn(bond_dev, slave->dev, "%s: failed to add SA\n", __func__); ipsec->xs->xso.real_dev = NULL; } } spin_unlock_bh(&bond->ipsec_lock); out: rcu_read_unlock(); } /** * bond_ipsec_del_sa - clear out this specific SA * @xs: pointer to transformer state struct **/ static void bond_ipsec_del_sa(struct xfrm_state *xs) { struct net_device *bond_dev = xs->xso.dev; struct bond_ipsec *ipsec; struct bonding *bond; struct 
slave *slave; if (!bond_dev) return; rcu_read_lock(); bond = netdev_priv(bond_dev); slave = rcu_dereference(bond->curr_active_slave); if (!slave) goto out; if (!xs->xso.real_dev) goto out; WARN_ON(xs->xso.real_dev != slave->dev); if (!slave->dev->xfrmdev_ops || !slave->dev->xfrmdev_ops->xdo_dev_state_delete || netif_is_bond_master(slave->dev)) { slave_warn(bond_dev, slave->dev, "%s: no slave xdo_dev_state_delete\n", __func__); goto out; } slave->dev->xfrmdev_ops->xdo_dev_state_delete(xs); out: spin_lock_bh(&bond->ipsec_lock); list_for_each_entry(ipsec, &bond->ipsec_list, list) { if (ipsec->xs == xs) { list_del(&ipsec->list); kfree(ipsec); break; } } spin_unlock_bh(&bond->ipsec_lock); rcu_read_unlock(); } static void bond_ipsec_del_sa_all(struct bonding *bond) { struct net_device *bond_dev = bond->dev; struct bond_ipsec *ipsec; struct slave *slave; rcu_read_lock(); slave = rcu_dereference(bond->curr_active_slave); if (!slave) { rcu_read_unlock(); return; } spin_lock_bh(&bond->ipsec_lock); list_for_each_entry(ipsec, &bond->ipsec_list, list) { if (!ipsec->xs->xso.real_dev) continue; if (!slave->dev->xfrmdev_ops || !slave->dev->xfrmdev_ops->xdo_dev_state_delete || netif_is_bond_master(slave->dev)) { slave_warn(bond_dev, slave->dev, "%s: no slave xdo_dev_state_delete\n", __func__); } else { slave->dev->xfrmdev_ops->xdo_dev_state_delete(ipsec->xs); } } spin_unlock_bh(&bond->ipsec_lock); rcu_read_unlock(); } /** * bond_ipsec_offload_ok - can this packet use the xfrm hw offload * @skb: current data packet * @xs: pointer to transformer state struct **/ static bool bond_ipsec_offload_ok(struct sk_buff *skb, struct xfrm_state *xs) { struct net_device *bond_dev = xs->xso.dev; struct net_device *real_dev; struct slave *curr_active; struct bonding *bond; bool ok = false; bond = netdev_priv(bond_dev); rcu_read_lock(); curr_active = rcu_dereference(bond->curr_active_slave); if (!curr_active) goto out; real_dev = curr_active->dev; if (BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP) goto out; if (!xs->xso.real_dev) goto out; if (!real_dev->xfrmdev_ops || !real_dev->xfrmdev_ops->xdo_dev_offload_ok || netif_is_bond_master(real_dev)) goto out; ok = real_dev->xfrmdev_ops->xdo_dev_offload_ok(skb, xs); out: rcu_read_unlock(); return ok; } static const struct xfrmdev_ops bond_xfrmdev_ops = { .xdo_dev_state_add = bond_ipsec_add_sa, .xdo_dev_state_delete = bond_ipsec_del_sa, .xdo_dev_offload_ok = bond_ipsec_offload_ok, }; #endif /* CONFIG_XFRM_OFFLOAD */ /*------------------------------- Link status -------------------------------*/ /* Set the carrier state for the master according to the state of its * slaves. If any slaves are up, the master is up. In 802.3ad mode, * do special 802.3ad magic. * * Returns zero if carrier state does not change, nonzero if it does. */ int bond_set_carrier(struct bonding *bond) { struct list_head *iter; struct slave *slave; if (!bond_has_slaves(bond)) goto down; if (BOND_MODE(bond) == BOND_MODE_8023AD) return bond_3ad_set_carrier(bond); bond_for_each_slave(bond, slave, iter) { if (slave->link == BOND_LINK_UP) { if (!netif_carrier_ok(bond->dev)) { netif_carrier_on(bond->dev); return 1; } return 0; } } down: if (netif_carrier_ok(bond->dev)) { netif_carrier_off(bond->dev); return 1; } return 0; } /* Get link speed and duplex from the slave's base driver * using ethtool. If for some reason the call fails or the * values are invalid, set speed and duplex to -1, * and return. Return 1 if speed or duplex settings are * UNKNOWN; 0 otherwise. 
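 *
 * For example, a 1000/full slave ends up with slave->speed == 1000 and
 * slave->duplex == DUPLEX_FULL and the function returns 0, while a slave
 * whose driver reports SPEED_UNKNOWN or a duplex other than full/half keeps
 * the UNKNOWN values and the function returns 1.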
*/ static int bond_update_speed_duplex(struct slave *slave) { struct net_device *slave_dev = slave->dev; struct ethtool_link_ksettings ecmd; int res; slave->speed = SPEED_UNKNOWN; slave->duplex = DUPLEX_UNKNOWN; res = __ethtool_get_link_ksettings(slave_dev, &ecmd); if (res < 0) return 1; if (ecmd.base.speed == 0 || ecmd.base.speed == ((__u32)-1)) return 1; switch (ecmd.base.duplex) { case DUPLEX_FULL: case DUPLEX_HALF: break; default: return 1; } slave->speed = ecmd.base.speed; slave->duplex = ecmd.base.duplex; return 0; } const char *bond_slave_link_status(s8 link) { switch (link) { case BOND_LINK_UP: return "up"; case BOND_LINK_FAIL: return "going down"; case BOND_LINK_DOWN: return "down"; case BOND_LINK_BACK: return "going back"; default: return "unknown"; } } /* if <dev> supports MII link status reporting, check its link status. * * We either do MII/ETHTOOL ioctls, or check netif_carrier_ok(), * depending upon the setting of the use_carrier parameter. * * Return either BMSR_LSTATUS, meaning that the link is up (or we * can't tell and just pretend it is), or 0, meaning that the link is * down. * * If reporting is non-zero, instead of faking link up, return -1 if * both ETHTOOL and MII ioctls fail (meaning the device does not * support them). If use_carrier is set, return whatever it says. * It'd be nice if there was a good way to tell if a driver supports * netif_carrier, but there really isn't. */ static int bond_check_dev_link(struct bonding *bond, struct net_device *slave_dev, int reporting) { const struct net_device_ops *slave_ops = slave_dev->netdev_ops; int (*ioctl)(struct net_device *, struct ifreq *, int); struct ifreq ifr; struct mii_ioctl_data *mii; if (!reporting && !netif_running(slave_dev)) return 0; if (bond->params.use_carrier) return netif_carrier_ok(slave_dev) ? BMSR_LSTATUS : 0; /* Try to get link status using Ethtool first. */ if (slave_dev->ethtool_ops->get_link) return slave_dev->ethtool_ops->get_link(slave_dev) ? BMSR_LSTATUS : 0; /* Ethtool can't be used, fallback to MII ioctls. */ ioctl = slave_ops->ndo_eth_ioctl; if (ioctl) { /* TODO: set pointer to correct ioctl on a per team member * bases to make this more efficient. that is, once * we determine the correct ioctl, we will always * call it and not the others for that team * member. */ /* We cannot assume that SIOCGMIIPHY will also read a * register; not all network drivers (e.g., e100) * support that. */ /* Yes, the mii is overlaid on the ifreq.ifr_ifru */ strscpy_pad(ifr.ifr_name, slave_dev->name, IFNAMSIZ); mii = if_mii(&ifr); if (ioctl(slave_dev, &ifr, SIOCGMIIPHY) == 0) { mii->reg_num = MII_BMSR; if (ioctl(slave_dev, &ifr, SIOCGMIIREG) == 0) return mii->val_out & BMSR_LSTATUS; } } /* If reporting, report that either there's no ndo_eth_ioctl, * or both SIOCGMIIREG and get_link failed (meaning that we * cannot report link status). If not reporting, pretend * we're ok. */ return reporting ? 
-1 : BMSR_LSTATUS; } /*----------------------------- Multicast list ------------------------------*/ /* Push the promiscuity flag down to appropriate slaves */ static int bond_set_promiscuity(struct bonding *bond, int inc) { struct list_head *iter; int err = 0; if (bond_uses_primary(bond)) { struct slave *curr_active = rtnl_dereference(bond->curr_active_slave); if (curr_active) err = dev_set_promiscuity(curr_active->dev, inc); } else { struct slave *slave; bond_for_each_slave(bond, slave, iter) { err = dev_set_promiscuity(slave->dev, inc); if (err) return err; } } return err; } /* Push the allmulti flag down to all slaves */ static int bond_set_allmulti(struct bonding *bond, int inc) { struct list_head *iter; int err = 0; if (bond_uses_primary(bond)) { struct slave *curr_active = rtnl_dereference(bond->curr_active_slave); if (curr_active) err = dev_set_allmulti(curr_active->dev, inc); } else { struct slave *slave; bond_for_each_slave(bond, slave, iter) { err = dev_set_allmulti(slave->dev, inc); if (err) return err; } } return err; } /* Retrieve the list of registered multicast addresses for the bonding * device and retransmit an IGMP JOIN request to the current active * slave. */ static void bond_resend_igmp_join_requests_delayed(struct work_struct *work) { struct bonding *bond = container_of(work, struct bonding, mcast_work.work); if (!rtnl_trylock()) { queue_delayed_work(bond->wq, &bond->mcast_work, 1); return; } call_netdevice_notifiers(NETDEV_RESEND_IGMP, bond->dev); if (bond->igmp_retrans > 1) { bond->igmp_retrans--; queue_delayed_work(bond->wq, &bond->mcast_work, HZ/5); } rtnl_unlock(); } /* Flush bond's hardware addresses from slave */ static void bond_hw_addr_flush(struct net_device *bond_dev, struct net_device *slave_dev) { struct bonding *bond = netdev_priv(bond_dev); dev_uc_unsync(slave_dev, bond_dev); dev_mc_unsync(slave_dev, bond_dev); if (BOND_MODE(bond) == BOND_MODE_8023AD) dev_mc_del(slave_dev, lacpdu_mcast_addr); } /*--------------------------- Active slave change ---------------------------*/ /* Update the hardware address list and promisc/allmulti for the new and * old active slaves (if any). Modes that are not using primary keep all * slaves up date at all times; only the modes that use primary need to call * this function to swap these settings during a failover. */ static void bond_hw_addr_swap(struct bonding *bond, struct slave *new_active, struct slave *old_active) { if (old_active) { if (bond->dev->flags & IFF_PROMISC) dev_set_promiscuity(old_active->dev, -1); if (bond->dev->flags & IFF_ALLMULTI) dev_set_allmulti(old_active->dev, -1); if (bond->dev->flags & IFF_UP) bond_hw_addr_flush(bond->dev, old_active->dev); } if (new_active) { /* FIXME: Signal errors upstream. */ if (bond->dev->flags & IFF_PROMISC) dev_set_promiscuity(new_active->dev, 1); if (bond->dev->flags & IFF_ALLMULTI) dev_set_allmulti(new_active->dev, 1); if (bond->dev->flags & IFF_UP) { netif_addr_lock_bh(bond->dev); dev_uc_sync(new_active->dev, bond->dev); dev_mc_sync(new_active->dev, bond->dev); netif_addr_unlock_bh(bond->dev); } } } /** * bond_set_dev_addr - clone slave's address to bond * @bond_dev: bond net device * @slave_dev: slave net device * * Should be called with RTNL held. 
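 *
 * On success the bond's MAC becomes a copy of the slave's, addr_assign_type
 * is marked NET_ADDR_STOLEN and a NETDEV_CHANGEADDR notification is sent.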
*/ static int bond_set_dev_addr(struct net_device *bond_dev, struct net_device *slave_dev) { int err; slave_dbg(bond_dev, slave_dev, "bond_dev=%p slave_dev=%p slave_dev->addr_len=%d\n", bond_dev, slave_dev, slave_dev->addr_len); err = dev_pre_changeaddr_notify(bond_dev, slave_dev->dev_addr, NULL); if (err) return err; memcpy(bond_dev->dev_addr, slave_dev->dev_addr, slave_dev->addr_len); bond_dev->addr_assign_type = NET_ADDR_STOLEN; call_netdevice_notifiers(NETDEV_CHANGEADDR, bond_dev); return 0; } static struct slave *bond_get_old_active(struct bonding *bond, struct slave *new_active) { struct slave *slave; struct list_head *iter; bond_for_each_slave(bond, slave, iter) { if (slave == new_active) continue; if (ether_addr_equal(bond->dev->dev_addr, slave->dev->dev_addr)) return slave; } return NULL; } /* bond_do_fail_over_mac * * Perform special MAC address swapping for fail_over_mac settings * * Called with RTNL */ static void bond_do_fail_over_mac(struct bonding *bond, struct slave *new_active, struct slave *old_active) { u8 tmp_mac[MAX_ADDR_LEN]; struct sockaddr_storage ss; int rv; switch (bond->params.fail_over_mac) { case BOND_FOM_ACTIVE: if (new_active) { rv = bond_set_dev_addr(bond->dev, new_active->dev); if (rv) slave_err(bond->dev, new_active->dev, "Error %d setting bond MAC from slave\n", -rv); } break; case BOND_FOM_FOLLOW: /* if new_active && old_active, swap them * if just old_active, do nothing (going to no active slave) * if just new_active, set new_active to bond's MAC */ if (!new_active) return; if (!old_active) old_active = bond_get_old_active(bond, new_active); if (old_active) { bond_hw_addr_copy(tmp_mac, new_active->dev->dev_addr, new_active->dev->addr_len); bond_hw_addr_copy(ss.__data, old_active->dev->dev_addr, old_active->dev->addr_len); ss.ss_family = new_active->dev->type; } else { bond_hw_addr_copy(ss.__data, bond->dev->dev_addr, bond->dev->addr_len); ss.ss_family = bond->dev->type; } rv = dev_set_mac_address(new_active->dev, (struct sockaddr *)&ss, NULL); if (rv) { slave_err(bond->dev, new_active->dev, "Error %d setting MAC of new active slave\n", -rv); goto out; } if (!old_active) goto out; bond_hw_addr_copy(ss.__data, tmp_mac, new_active->dev->addr_len); ss.ss_family = old_active->dev->type; rv = dev_set_mac_address(old_active->dev, (struct sockaddr *)&ss, NULL); if (rv) slave_err(bond->dev, old_active->dev, "Error %d setting MAC of old active slave\n", -rv); out: break; default: netdev_err(bond->dev, "bond_do_fail_over_mac impossible: bad policy %d\n", bond->params.fail_over_mac); break; } } static struct slave *bond_choose_primary_or_current(struct bonding *bond) { struct slave *prim = rtnl_dereference(bond->primary_slave); struct slave *curr = rtnl_dereference(bond->curr_active_slave); if (!prim || prim->link != BOND_LINK_UP) { if (!curr || curr->link != BOND_LINK_UP) return NULL; return curr; } if (bond->force_primary) { bond->force_primary = false; return prim; } if (!curr || curr->link != BOND_LINK_UP) return prim; /* At this point, prim and curr are both up */ switch (bond->params.primary_reselect) { case BOND_PRI_RESELECT_ALWAYS: return prim; case BOND_PRI_RESELECT_BETTER: if (prim->speed < curr->speed) return curr; if (prim->speed == curr->speed && prim->duplex <= curr->duplex) return curr; return prim; case BOND_PRI_RESELECT_FAILURE: return curr; default: netdev_err(bond->dev, "impossible primary_reselect %d\n", bond->params.primary_reselect); return curr; } } /** * bond_find_best_slave - select the best available slave to be the active one * @bond: our 
bonding struct */ static struct slave *bond_find_best_slave(struct bonding *bond) { struct slave *slave, *bestslave = NULL; struct list_head *iter; int mintime = bond->params.updelay; slave = bond_choose_primary_or_current(bond); if (slave) return slave; bond_for_each_slave(bond, slave, iter) { if (slave->link == BOND_LINK_UP) return slave; if (slave->link == BOND_LINK_BACK && bond_slave_is_up(slave) && slave->delay < mintime) { mintime = slave->delay; bestslave = slave; } } return bestslave; } /* must be called in RCU critical section or with RTNL held */ static bool bond_should_notify_peers(struct bonding *bond) { struct slave *slave = rcu_dereference_rtnl(bond->curr_active_slave); if (!slave || !bond->send_peer_notif || bond->send_peer_notif % max(1, bond->params.peer_notif_delay) != 0 || !netif_carrier_ok(bond->dev) || test_bit(__LINK_STATE_LINKWATCH_PENDING, &slave->dev->state)) return false; netdev_dbg(bond->dev, "bond_should_notify_peers: slave %s\n", slave ? slave->dev->name : "NULL"); return true; } /** * bond_change_active_slave - change the active slave into the specified one * @bond: our bonding struct * @new_active: the new slave to make the active one * * Set the new slave to the bond's settings and unset them on the old * curr_active_slave. * Setting include flags, mc-list, promiscuity, allmulti, etc. * * If @new's link state is %BOND_LINK_BACK we'll set it to %BOND_LINK_UP, * because it is apparently the best available slave we have, even though its * updelay hasn't timed out yet. * * Caller must hold RTNL. */ void bond_change_active_slave(struct bonding *bond, struct slave *new_active) { struct slave *old_active; ASSERT_RTNL(); old_active = rtnl_dereference(bond->curr_active_slave); if (old_active == new_active) return; #ifdef CONFIG_XFRM_OFFLOAD bond_ipsec_del_sa_all(bond); #endif /* CONFIG_XFRM_OFFLOAD */ if (new_active) { new_active->last_link_up = jiffies; if (new_active->link == BOND_LINK_BACK) { if (bond_uses_primary(bond)) { slave_info(bond->dev, new_active->dev, "making interface the new active one %d ms earlier\n", (bond->params.updelay - new_active->delay) * bond->params.miimon); } new_active->delay = 0; bond_set_slave_link_state(new_active, BOND_LINK_UP, BOND_SLAVE_NOTIFY_NOW); if (BOND_MODE(bond) == BOND_MODE_8023AD) bond_3ad_handle_link_change(new_active, BOND_LINK_UP); if (bond_is_lb(bond)) bond_alb_handle_link_change(bond, new_active, BOND_LINK_UP); } else { if (bond_uses_primary(bond)) slave_info(bond->dev, new_active->dev, "making interface the new active one\n"); } } if (bond_uses_primary(bond)) bond_hw_addr_swap(bond, new_active, old_active); if (bond_is_lb(bond)) { bond_alb_handle_active_change(bond, new_active); if (old_active) bond_set_slave_inactive_flags(old_active, BOND_SLAVE_NOTIFY_NOW); if (new_active) bond_set_slave_active_flags(new_active, BOND_SLAVE_NOTIFY_NOW); } else { rcu_assign_pointer(bond->curr_active_slave, new_active); } if (BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP) { if (old_active) bond_set_slave_inactive_flags(old_active, BOND_SLAVE_NOTIFY_NOW); if (new_active) { bool should_notify_peers = false; bond_set_slave_active_flags(new_active, BOND_SLAVE_NOTIFY_NOW); if (bond->params.fail_over_mac) bond_do_fail_over_mac(bond, new_active, old_active); if (netif_running(bond->dev)) { bond->send_peer_notif = bond->params.num_peer_notif * max(1, bond->params.peer_notif_delay); should_notify_peers = bond_should_notify_peers(bond); } call_netdevice_notifiers(NETDEV_BONDING_FAILOVER, bond->dev); if (should_notify_peers) { bond->send_peer_notif--; 
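				/* send_peer_notif was primed above to
				 * num_peer_notif * max(1, peer_notif_delay);
				 * one notification is consumed here and the
				 * NETDEV_NOTIFY_PEERS notifier below lets
				 * ARP/ND re-announce the new active slave's
				 * address to the peers.
				 */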
call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, bond->dev); } } } #ifdef CONFIG_XFRM_OFFLOAD bond_ipsec_add_sa_all(bond); #endif /* CONFIG_XFRM_OFFLOAD */ /* resend IGMP joins since active slave has changed or * all were sent on curr_active_slave. * resend only if bond is brought up with the affected * bonding modes and the retransmission is enabled */ if (netif_running(bond->dev) && (bond->params.resend_igmp > 0) && ((bond_uses_primary(bond) && new_active) || BOND_MODE(bond) == BOND_MODE_ROUNDROBIN)) { bond->igmp_retrans = bond->params.resend_igmp; queue_delayed_work(bond->wq, &bond->mcast_work, 1); } } /** * bond_select_active_slave - select a new active slave, if needed * @bond: our bonding struct * * This functions should be called when one of the following occurs: * - The old curr_active_slave has been released or lost its link. * - The primary_slave has got its link back. * - A slave has got its link back and there's no old curr_active_slave. * * Caller must hold RTNL. */ void bond_select_active_slave(struct bonding *bond) { struct slave *best_slave; int rv; ASSERT_RTNL(); best_slave = bond_find_best_slave(bond); if (best_slave != rtnl_dereference(bond->curr_active_slave)) { bond_change_active_slave(bond, best_slave); rv = bond_set_carrier(bond); if (!rv) return; if (netif_carrier_ok(bond->dev)) netdev_info(bond->dev, "active interface up!\n"); else netdev_info(bond->dev, "now running without any active interface!\n"); } } #ifdef CONFIG_NET_POLL_CONTROLLER static inline int slave_enable_netpoll(struct slave *slave) { struct netpoll *np; int err = 0; np = kzalloc(sizeof(*np), GFP_KERNEL); err = -ENOMEM; if (!np) goto out; err = __netpoll_setup(np, slave->dev); if (err) { kfree(np); goto out; } slave->np = np; out: return err; } static inline void slave_disable_netpoll(struct slave *slave) { struct netpoll *np = slave->np; if (!np) return; slave->np = NULL; __netpoll_free(np); } static void bond_poll_controller(struct net_device *bond_dev) { struct bonding *bond = netdev_priv(bond_dev); struct slave *slave = NULL; struct list_head *iter; struct ad_info ad_info; if (BOND_MODE(bond) == BOND_MODE_8023AD) if (bond_3ad_get_active_agg_info(bond, &ad_info)) return; bond_for_each_slave_rcu(bond, slave, iter) { if (!bond_slave_is_up(slave)) continue; if (BOND_MODE(bond) == BOND_MODE_8023AD) { struct aggregator *agg = SLAVE_AD_INFO(slave)->port.aggregator; if (agg && agg->aggregator_identifier != ad_info.aggregator_id) continue; } netpoll_poll_dev(slave->dev); } } static void bond_netpoll_cleanup(struct net_device *bond_dev) { struct bonding *bond = netdev_priv(bond_dev); struct list_head *iter; struct slave *slave; bond_for_each_slave(bond, slave, iter) if (bond_slave_is_up(slave)) slave_disable_netpoll(slave); } static int bond_netpoll_setup(struct net_device *dev, struct netpoll_info *ni) { struct bonding *bond = netdev_priv(dev); struct list_head *iter; struct slave *slave; int err = 0; bond_for_each_slave(bond, slave, iter) { err = slave_enable_netpoll(slave); if (err) { bond_netpoll_cleanup(dev); break; } } return err; } #else static inline int slave_enable_netpoll(struct slave *slave) { return 0; } static inline void slave_disable_netpoll(struct slave *slave) { } static void bond_netpoll_cleanup(struct net_device *bond_dev) { } #endif /*---------------------------------- IOCTL ----------------------------------*/ static netdev_features_t bond_fix_features(struct net_device *dev, netdev_features_t features) { struct bonding *bond = netdev_priv(dev); struct list_head *iter; netdev_features_t 
mask; struct slave *slave; #if IS_ENABLED(CONFIG_TLS_DEVICE) if (bond_sk_check(bond)) features |= BOND_TLS_FEATURES; else features &= ~BOND_TLS_FEATURES; #endif mask = features; features &= ~NETIF_F_ONE_FOR_ALL; features |= NETIF_F_ALL_FOR_ALL; bond_for_each_slave(bond, slave, iter) { features = netdev_increment_features(features, slave->dev->features, mask); } features = netdev_add_tso_features(features, mask); return features; } #define BOND_VLAN_FEATURES (NETIF_F_HW_CSUM | NETIF_F_SG | \ NETIF_F_FRAGLIST | NETIF_F_GSO_SOFTWARE | \ NETIF_F_GSO_ENCAP_ALL | \ NETIF_F_HIGHDMA | NETIF_F_LRO) #define BOND_ENC_FEATURES (NETIF_F_HW_CSUM | NETIF_F_SG | \ NETIF_F_RXCSUM | NETIF_F_GSO_SOFTWARE) #define BOND_MPLS_FEATURES (NETIF_F_HW_CSUM | NETIF_F_SG | \ NETIF_F_GSO_SOFTWARE) static void bond_compute_features(struct bonding *bond) { unsigned int dst_release_flag = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM; netdev_features_t vlan_features = BOND_VLAN_FEATURES; netdev_features_t enc_features = BOND_ENC_FEATURES; #ifdef CONFIG_XFRM_OFFLOAD netdev_features_t xfrm_features = BOND_XFRM_FEATURES; #endif /* CONFIG_XFRM_OFFLOAD */ netdev_features_t mpls_features = BOND_MPLS_FEATURES; struct net_device *bond_dev = bond->dev; struct list_head *iter; struct slave *slave; unsigned short max_hard_header_len = ETH_HLEN; unsigned int gso_max_size = GSO_MAX_SIZE; u16 gso_max_segs = GSO_MAX_SEGS; if (!bond_has_slaves(bond)) goto done; vlan_features &= NETIF_F_ALL_FOR_ALL; mpls_features &= NETIF_F_ALL_FOR_ALL; bond_for_each_slave(bond, slave, iter) { vlan_features = netdev_increment_features(vlan_features, slave->dev->vlan_features, BOND_VLAN_FEATURES); enc_features = netdev_increment_features(enc_features, slave->dev->hw_enc_features, BOND_ENC_FEATURES); #ifdef CONFIG_XFRM_OFFLOAD xfrm_features = netdev_increment_features(xfrm_features, slave->dev->hw_enc_features, BOND_XFRM_FEATURES); #endif /* CONFIG_XFRM_OFFLOAD */ mpls_features = netdev_increment_features(mpls_features, slave->dev->mpls_features, BOND_MPLS_FEATURES); dst_release_flag &= slave->dev->priv_flags; if (slave->dev->hard_header_len > max_hard_header_len) max_hard_header_len = slave->dev->hard_header_len; gso_max_size = min(gso_max_size, slave->dev->gso_max_size); gso_max_segs = min(gso_max_segs, slave->dev->gso_max_segs); } bond_dev->hard_header_len = max_hard_header_len; done: bond_dev->vlan_features = vlan_features; bond_dev->hw_enc_features = enc_features | NETIF_F_GSO_ENCAP_ALL | NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX; #ifdef CONFIG_XFRM_OFFLOAD bond_dev->hw_enc_features |= xfrm_features; #endif /* CONFIG_XFRM_OFFLOAD */ bond_dev->mpls_features = mpls_features; bond_dev->gso_max_segs = gso_max_segs; netif_set_gso_max_size(bond_dev, gso_max_size); bond_dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; if ((bond_dev->priv_flags & IFF_XMIT_DST_RELEASE_PERM) && dst_release_flag == (IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM)) bond_dev->priv_flags |= IFF_XMIT_DST_RELEASE; netdev_change_features(bond_dev); } static void bond_setup_by_slave(struct net_device *bond_dev, struct net_device *slave_dev) { bool was_up = !!(bond_dev->flags & IFF_UP); dev_close(bond_dev); bond_dev->header_ops = slave_dev->header_ops; bond_dev->type = slave_dev->type; bond_dev->hard_header_len = slave_dev->hard_header_len; bond_dev->needed_headroom = slave_dev->needed_headroom; bond_dev->addr_len = slave_dev->addr_len; memcpy(bond_dev->broadcast, slave_dev->broadcast, slave_dev->addr_len); if (slave_dev->flags & IFF_POINTOPOINT) { bond_dev->flags &= 
~(IFF_BROADCAST | IFF_MULTICAST); bond_dev->flags |= (IFF_POINTOPOINT | IFF_NOARP); } if (was_up) dev_open(bond_dev, NULL); } /* On bonding slaves other than the currently active slave, suppress * duplicates except for alb non-mcast/bcast. */ static bool bond_should_deliver_exact_match(struct sk_buff *skb, struct slave *slave, struct bonding *bond) { if (bond_is_slave_inactive(slave)) { if (BOND_MODE(bond) == BOND_MODE_ALB && skb->pkt_type != PACKET_BROADCAST && skb->pkt_type != PACKET_MULTICAST) return false; return true; } return false; } static rx_handler_result_t bond_handle_frame(struct sk_buff **pskb) { struct sk_buff *skb = *pskb; struct slave *slave; struct bonding *bond; int (*recv_probe)(const struct sk_buff *, struct bonding *, struct slave *); int ret = RX_HANDLER_ANOTHER; skb = skb_share_check(skb, GFP_ATOMIC); if (unlikely(!skb)) return RX_HANDLER_CONSUMED; *pskb = skb; slave = bond_slave_get_rcu(skb->dev); bond = slave->bond; recv_probe = READ_ONCE(bond->recv_probe); if (recv_probe) { ret = recv_probe(skb, bond, slave); if (ret == RX_HANDLER_CONSUMED) { consume_skb(skb); return ret; } } /* * For packets determined by bond_should_deliver_exact_match() call to * be suppressed we want to make an exception for link-local packets. * This is necessary for e.g. LLDP daemons to be able to monitor * inactive slave links without being forced to bind to them * explicitly. * * At the same time, packets that are passed to the bonding master * (including link-local ones) can have their originating interface * determined via PACKET_ORIGDEV socket option. */ if (bond_should_deliver_exact_match(skb, slave, bond)) { if (is_link_local_ether_addr(eth_hdr(skb)->h_dest)) return RX_HANDLER_PASS; return RX_HANDLER_EXACT; } skb->dev = bond->dev; if (BOND_MODE(bond) == BOND_MODE_ALB && netif_is_bridge_port(bond->dev) && skb->pkt_type == PACKET_HOST) { if (unlikely(skb_cow_head(skb, skb->data - skb_mac_header(skb)))) { kfree_skb(skb); return RX_HANDLER_CONSUMED; } bond_hw_addr_copy(eth_hdr(skb)->h_dest, bond->dev->dev_addr, bond->dev->addr_len); } return ret; } static enum netdev_lag_tx_type bond_lag_tx_type(struct bonding *bond) { switch (BOND_MODE(bond)) { case BOND_MODE_ROUNDROBIN: return NETDEV_LAG_TX_TYPE_ROUNDROBIN; case BOND_MODE_ACTIVEBACKUP: return NETDEV_LAG_TX_TYPE_ACTIVEBACKUP; case BOND_MODE_BROADCAST: return NETDEV_LAG_TX_TYPE_BROADCAST; case BOND_MODE_XOR: case BOND_MODE_8023AD: return NETDEV_LAG_TX_TYPE_HASH; default: return NETDEV_LAG_TX_TYPE_UNKNOWN; } } static enum netdev_lag_hash bond_lag_hash_type(struct bonding *bond, enum netdev_lag_tx_type type) { if (type != NETDEV_LAG_TX_TYPE_HASH) return NETDEV_LAG_HASH_NONE; switch (bond->params.xmit_policy) { case BOND_XMIT_POLICY_LAYER2: return NETDEV_LAG_HASH_L2; case BOND_XMIT_POLICY_LAYER34: return NETDEV_LAG_HASH_L34; case BOND_XMIT_POLICY_LAYER23: return NETDEV_LAG_HASH_L23; case BOND_XMIT_POLICY_ENCAP23: return NETDEV_LAG_HASH_E23; case BOND_XMIT_POLICY_ENCAP34: return NETDEV_LAG_HASH_E34; case BOND_XMIT_POLICY_VLAN_SRCMAC: return NETDEV_LAG_HASH_VLAN_SRCMAC; default: return NETDEV_LAG_HASH_UNKNOWN; } } static int bond_master_upper_dev_link(struct bonding *bond, struct slave *slave, struct netlink_ext_ack *extack) { struct netdev_lag_upper_info lag_upper_info; enum netdev_lag_tx_type type; type = bond_lag_tx_type(bond); lag_upper_info.tx_type = type; lag_upper_info.hash_type = bond_lag_hash_type(bond, type); return netdev_master_upper_dev_link(slave->dev, bond->dev, slave, &lag_upper_info, extack); } static void 
bond_upper_dev_unlink(struct bonding *bond, struct slave *slave) { netdev_upper_dev_unlink(slave->dev, bond->dev); slave->dev->flags &= ~IFF_SLAVE; } static void slave_kobj_release(struct kobject *kobj) { struct slave *slave = to_slave(kobj); struct bonding *bond = bond_get_bond_by_slave(slave); cancel_delayed_work_sync(&slave->notify_work); if (BOND_MODE(bond) == BOND_MODE_8023AD) kfree(SLAVE_AD_INFO(slave)); kfree(slave); } static struct kobj_type slave_ktype = { .release = slave_kobj_release, #ifdef CONFIG_SYSFS .sysfs_ops = &slave_sysfs_ops, #endif }; static int bond_kobj_init(struct slave *slave) { int err; err = kobject_init_and_add(&slave->kobj, &slave_ktype, &(slave->dev->dev.kobj), "bonding_slave"); if (err) kobject_put(&slave->kobj); return err; } static struct slave *bond_alloc_slave(struct bonding *bond, struct net_device *slave_dev) { struct slave *slave = NULL; slave = kzalloc(sizeof(*slave), GFP_KERNEL); if (!slave) return NULL; slave->bond = bond; slave->dev = slave_dev; INIT_DELAYED_WORK(&slave->notify_work, bond_netdev_notify_work); if (bond_kobj_init(slave)) return NULL; if (BOND_MODE(bond) == BOND_MODE_8023AD) { SLAVE_AD_INFO(slave) = kzalloc(sizeof(struct ad_slave_info), GFP_KERNEL); if (!SLAVE_AD_INFO(slave)) { kobject_put(&slave->kobj); return NULL; } } return slave; } static void bond_fill_ifbond(struct bonding *bond, struct ifbond *info) { info->bond_mode = BOND_MODE(bond); info->miimon = bond->params.miimon; info->num_slaves = bond->slave_cnt; } static void bond_fill_ifslave(struct slave *slave, struct ifslave *info) { strcpy(info->slave_name, slave->dev->name); info->link = slave->link; info->state = bond_slave_state(slave); info->link_failure_count = slave->link_failure_count; } static void bond_netdev_notify_work(struct work_struct *_work) { struct slave *slave = container_of(_work, struct slave, notify_work.work); if (rtnl_trylock()) { struct netdev_bonding_info binfo; bond_fill_ifslave(slave, &binfo.slave); bond_fill_ifbond(slave->bond, &binfo.master); netdev_bonding_info_change(slave->dev, &binfo); rtnl_unlock(); } else { queue_delayed_work(slave->bond->wq, &slave->notify_work, 1); } } void bond_queue_slave_event(struct slave *slave) { queue_delayed_work(slave->bond->wq, &slave->notify_work, 0); } void bond_lower_state_changed(struct slave *slave) { struct netdev_lag_lower_state_info info; info.link_up = slave->link == BOND_LINK_UP || slave->link == BOND_LINK_FAIL; info.tx_enabled = bond_is_active_slave(slave); netdev_lower_state_changed(slave->dev, &info); } #define BOND_NL_ERR(bond_dev, extack, errmsg) do { \ if (extack) \ NL_SET_ERR_MSG(extack, errmsg); \ else \ netdev_err(bond_dev, "Error: %s\n", errmsg); \ } while (0) #define SLAVE_NL_ERR(bond_dev, slave_dev, extack, errmsg) do { \ if (extack) \ NL_SET_ERR_MSG(extack, errmsg); \ else \ slave_err(bond_dev, slave_dev, "Error: %s\n", errmsg); \ } while (0) /* The bonding driver uses ether_setup() to convert a master bond device * to ARPHRD_ETHER, that resets the target netdevice's flags so we always * have to restore the IFF_MASTER flag, and only restore IFF_SLAVE and IFF_UP * if they were set */ static void bond_ether_setup(struct net_device *bond_dev) { unsigned int flags = bond_dev->flags & (IFF_SLAVE | IFF_UP); ether_setup(bond_dev); bond_dev->flags |= IFF_MASTER | flags; bond_dev->priv_flags &= ~IFF_TX_SKB_SHARING; } /* enslave device <slave> to bond device <master> */ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev, struct netlink_ext_ack *extack) { struct bonding *bond = 
netdev_priv(bond_dev); const struct net_device_ops *slave_ops = slave_dev->netdev_ops; struct slave *new_slave = NULL, *prev_slave; struct sockaddr_storage ss; int link_reporting; int res = 0, i; if (slave_dev->flags & IFF_MASTER && !netif_is_bond_master(slave_dev)) { BOND_NL_ERR(bond_dev, extack, "Device type (master device) cannot be enslaved"); return -EPERM; } if (!bond->params.use_carrier && slave_dev->ethtool_ops->get_link == NULL && slave_ops->ndo_eth_ioctl == NULL) { slave_warn(bond_dev, slave_dev, "no link monitoring support\n"); } /* already in-use? */ if (netdev_is_rx_handler_busy(slave_dev)) { SLAVE_NL_ERR(bond_dev, slave_dev, extack, "Device is in use and cannot be enslaved"); return -EBUSY; } if (bond_dev == slave_dev) { BOND_NL_ERR(bond_dev, extack, "Cannot enslave bond to itself."); return -EPERM; } /* vlan challenged mutual exclusion */ /* no need to lock since we're protected by rtnl_lock */ if (slave_dev->features & NETIF_F_VLAN_CHALLENGED) { slave_dbg(bond_dev, slave_dev, "is NETIF_F_VLAN_CHALLENGED\n"); if (vlan_uses_dev(bond_dev)) { SLAVE_NL_ERR(bond_dev, slave_dev, extack, "Can not enslave VLAN challenged device to VLAN enabled bond"); return -EPERM; } else { slave_warn(bond_dev, slave_dev, "enslaved VLAN challenged slave. Adding VLANs will be blocked as long as it is part of bond.\n"); } } else { slave_dbg(bond_dev, slave_dev, "is !NETIF_F_VLAN_CHALLENGED\n"); } if (slave_dev->features & NETIF_F_HW_ESP) slave_dbg(bond_dev, slave_dev, "is esp-hw-offload capable\n"); /* Old ifenslave binaries are no longer supported. These can * be identified with moderate accuracy by the state of the slave: * the current ifenslave will set the interface down prior to * enslaving it; the old ifenslave will not. */ if (slave_dev->flags & IFF_UP) { SLAVE_NL_ERR(bond_dev, slave_dev, extack, "Device can not be enslaved while up"); return -EPERM; } /* set bonding device ether type by slave - bonding netdevices are * created with ether_setup, so when the slave type is not ARPHRD_ETHER * there is a need to override some of the type dependent attribs/funcs. 
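	 * (bond_setup_by_slave(), called further down for non-Ethernet slaves,
	 * copies header_ops, type, hard_header_len, needed_headroom, addr_len
	 * and the broadcast address from that first slave.)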
* * bond ether type mutual exclusion - don't allow slaves of dissimilar * ether type (eg ARPHRD_ETHER and ARPHRD_INFINIBAND) share the same bond */ if (!bond_has_slaves(bond)) { if (bond_dev->type != slave_dev->type) { slave_dbg(bond_dev, slave_dev, "change device type from %d to %d\n", bond_dev->type, slave_dev->type); res = call_netdevice_notifiers(NETDEV_PRE_TYPE_CHANGE, bond_dev); res = notifier_to_errno(res); if (res) { slave_err(bond_dev, slave_dev, "refused to change device type\n"); return -EBUSY; } /* Flush unicast and multicast addresses */ dev_uc_flush(bond_dev); dev_mc_flush(bond_dev); if (slave_dev->type != ARPHRD_ETHER) bond_setup_by_slave(bond_dev, slave_dev); else bond_ether_setup(bond_dev); call_netdevice_notifiers(NETDEV_POST_TYPE_CHANGE, bond_dev); } } else if (bond_dev->type != slave_dev->type) { SLAVE_NL_ERR(bond_dev, slave_dev, extack, "Device type is different from other slaves"); return -EINVAL; } if (slave_dev->type == ARPHRD_INFINIBAND && BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP) { SLAVE_NL_ERR(bond_dev, slave_dev, extack, "Only active-backup mode is supported for infiniband slaves"); res = -EOPNOTSUPP; goto err_undo_flags; } if (!slave_ops->ndo_set_mac_address || slave_dev->type == ARPHRD_INFINIBAND) { slave_warn(bond_dev, slave_dev, "The slave device specified does not support setting the MAC address\n"); if (BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP && bond->params.fail_over_mac != BOND_FOM_ACTIVE) { if (!bond_has_slaves(bond)) { bond->params.fail_over_mac = BOND_FOM_ACTIVE; slave_warn(bond_dev, slave_dev, "Setting fail_over_mac to active for active-backup mode\n"); } else { SLAVE_NL_ERR(bond_dev, slave_dev, extack, "Slave device does not support setting the MAC address, but fail_over_mac is not set to active"); res = -EOPNOTSUPP; goto err_undo_flags; } } } call_netdevice_notifiers(NETDEV_JOIN, slave_dev); /* If this is the first slave, then we need to set the master's hardware * address to be the same as the slave's. */ if (!bond_has_slaves(bond) && bond->dev->addr_assign_type == NET_ADDR_RANDOM) { res = bond_set_dev_addr(bond->dev, slave_dev); if (res) goto err_undo_flags; } new_slave = bond_alloc_slave(bond, slave_dev); if (!new_slave) { res = -ENOMEM; goto err_undo_flags; } /* Set the new_slave's queue_id to be zero. Queue ID mapping * is set via sysfs or module option if desired. */ new_slave->queue_id = 0; /* Save slave's original mtu and then set it to match the bond */ new_slave->original_mtu = slave_dev->mtu; res = dev_set_mtu(slave_dev, bond->dev->mtu); if (res) { slave_err(bond_dev, slave_dev, "Error %d calling dev_set_mtu\n", res); goto err_free; } /* Save slave's original ("permanent") mac address for modes * that need it, and for restoring it upon release, and then * set it to the master's address */ bond_hw_addr_copy(new_slave->perm_hwaddr, slave_dev->dev_addr, slave_dev->addr_len); if (!bond->params.fail_over_mac || BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP) { /* Set slave to master's mac address. 
The application already * set the master's mac address to that of the first slave */ memcpy(ss.__data, bond_dev->dev_addr, bond_dev->addr_len); ss.ss_family = slave_dev->type; res = dev_set_mac_address(slave_dev, (struct sockaddr *)&ss, extack); if (res) { slave_err(bond_dev, slave_dev, "Error %d calling set_mac_address\n", res); goto err_restore_mtu; } } /* set slave flag before open to prevent IPv6 addrconf */ slave_dev->flags |= IFF_SLAVE; /* open the slave since the application closed it */ res = dev_open(slave_dev, extack); if (res) { slave_err(bond_dev, slave_dev, "Opening slave failed\n"); goto err_restore_mac; } slave_dev->priv_flags |= IFF_BONDING; /* initialize slave stats */ dev_get_stats(new_slave->dev, &new_slave->slave_stats); if (bond_is_lb(bond)) { /* bond_alb_init_slave() must be called before all other stages since * it might fail and we do not want to have to undo everything */ res = bond_alb_init_slave(bond, new_slave); if (res) goto err_close; } res = vlan_vids_add_by_dev(slave_dev, bond_dev); if (res) { slave_err(bond_dev, slave_dev, "Couldn't add bond vlan ids\n"); goto err_close; } prev_slave = bond_last_slave(bond); new_slave->delay = 0; new_slave->link_failure_count = 0; if (bond_update_speed_duplex(new_slave) && bond_needs_speed_duplex(bond)) new_slave->link = BOND_LINK_DOWN; new_slave->last_rx = jiffies - (msecs_to_jiffies(bond->params.arp_interval) + 1); for (i = 0; i < BOND_MAX_ARP_TARGETS; i++) new_slave->target_last_arp_rx[i] = new_slave->last_rx; if (bond->params.miimon && !bond->params.use_carrier) { link_reporting = bond_check_dev_link(bond, slave_dev, 1); if ((link_reporting == -1) && !bond->params.arp_interval) { /* miimon is set but a bonded network driver * does not support ETHTOOL/MII and * arp_interval is not set. Note: if * use_carrier is enabled, we will never go * here (because netif_carrier is always * supported); thus, we don't need to change * the messages for netif_carrier. */ slave_warn(bond_dev, slave_dev, "MII and ETHTOOL support not available for slave, and arp_interval/arp_ip_target module parameters not specified, thus bonding will not detect link failures! see bonding.txt for details\n"); } else if (link_reporting == -1) { /* unable get link status using mii/ethtool */ slave_warn(bond_dev, slave_dev, "can't get link status from slave; the network driver associated with this interface does not support MII or ETHTOOL link status reporting, thus miimon has no effect on this interface\n"); } } /* check for initial state */ new_slave->link = BOND_LINK_NOCHANGE; if (bond->params.miimon) { if (bond_check_dev_link(bond, slave_dev, 0) == BMSR_LSTATUS) { if (bond->params.updelay) { bond_set_slave_link_state(new_slave, BOND_LINK_BACK, BOND_SLAVE_NOTIFY_NOW); new_slave->delay = bond->params.updelay; } else { bond_set_slave_link_state(new_slave, BOND_LINK_UP, BOND_SLAVE_NOTIFY_NOW); } } else { bond_set_slave_link_state(new_slave, BOND_LINK_DOWN, BOND_SLAVE_NOTIFY_NOW); } } else if (bond->params.arp_interval) { bond_set_slave_link_state(new_slave, (netif_carrier_ok(slave_dev) ? BOND_LINK_UP : BOND_LINK_DOWN), BOND_SLAVE_NOTIFY_NOW); } else { bond_set_slave_link_state(new_slave, BOND_LINK_UP, BOND_SLAVE_NOTIFY_NOW); } if (new_slave->link != BOND_LINK_DOWN) new_slave->last_link_up = jiffies; slave_dbg(bond_dev, slave_dev, "Initial state of slave is BOND_LINK_%s\n", new_slave->link == BOND_LINK_DOWN ? "DOWN" : (new_slave->link == BOND_LINK_UP ? 
"UP" : "BACK")); if (bond_uses_primary(bond) && bond->params.primary[0]) { /* if there is a primary slave, remember it */ if (strcmp(bond->params.primary, new_slave->dev->name) == 0) { rcu_assign_pointer(bond->primary_slave, new_slave); bond->force_primary = true; } } switch (BOND_MODE(bond)) { case BOND_MODE_ACTIVEBACKUP: bond_set_slave_inactive_flags(new_slave, BOND_SLAVE_NOTIFY_NOW); break; case BOND_MODE_8023AD: /* in 802.3ad mode, the internal mechanism * will activate the slaves in the selected * aggregator */ bond_set_slave_inactive_flags(new_slave, BOND_SLAVE_NOTIFY_NOW); /* if this is the first slave */ if (!prev_slave) { SLAVE_AD_INFO(new_slave)->id = 1; /* Initialize AD with the number of times that the AD timer is called in 1 second * can be called only after the mac address of the bond is set */ bond_3ad_initialize(bond, 1000/AD_TIMER_INTERVAL); } else { SLAVE_AD_INFO(new_slave)->id = SLAVE_AD_INFO(prev_slave)->id + 1; } bond_3ad_bind_slave(new_slave); break; case BOND_MODE_TLB: case BOND_MODE_ALB: bond_set_active_slave(new_slave); bond_set_slave_inactive_flags(new_slave, BOND_SLAVE_NOTIFY_NOW); break; default: slave_dbg(bond_dev, slave_dev, "This slave is always active in trunk mode\n"); /* always active in trunk mode */ bond_set_active_slave(new_slave); /* In trunking mode there is little meaning to curr_active_slave * anyway (it holds no special properties of the bond device), * so we can change it without calling change_active_interface() */ if (!rcu_access_pointer(bond->curr_active_slave) && new_slave->link == BOND_LINK_UP) rcu_assign_pointer(bond->curr_active_slave, new_slave); break; } /* switch(bond_mode) */ #ifdef CONFIG_NET_POLL_CONTROLLER if (bond->dev->npinfo) { if (slave_enable_netpoll(new_slave)) { slave_info(bond_dev, slave_dev, "master_dev is using netpoll, but new slave device does not support netpoll\n"); res = -EBUSY; goto err_detach; } } #endif if (!(bond_dev->features & NETIF_F_LRO)) dev_disable_lro(slave_dev); res = netdev_rx_handler_register(slave_dev, bond_handle_frame, new_slave); if (res) { slave_dbg(bond_dev, slave_dev, "Error %d calling netdev_rx_handler_register\n", res); goto err_detach; } res = bond_master_upper_dev_link(bond, new_slave, extack); if (res) { slave_dbg(bond_dev, slave_dev, "Error %d calling bond_master_upper_dev_link\n", res); goto err_unregister; } bond_lower_state_changed(new_slave); res = bond_sysfs_slave_add(new_slave); if (res) { slave_dbg(bond_dev, slave_dev, "Error %d calling bond_sysfs_slave_add\n", res); goto err_upper_unlink; } /* If the mode uses primary, then the following is handled by * bond_change_active_slave(). 
*/ if (!bond_uses_primary(bond)) { /* set promiscuity level to new slave */ if (bond_dev->flags & IFF_PROMISC) { res = dev_set_promiscuity(slave_dev, 1); if (res) goto err_sysfs_del; } /* set allmulti level to new slave */ if (bond_dev->flags & IFF_ALLMULTI) { res = dev_set_allmulti(slave_dev, 1); if (res) { if (bond_dev->flags & IFF_PROMISC) dev_set_promiscuity(slave_dev, -1); goto err_sysfs_del; } } if (bond_dev->flags & IFF_UP) { netif_addr_lock_bh(bond_dev); dev_mc_sync_multiple(slave_dev, bond_dev); dev_uc_sync_multiple(slave_dev, bond_dev); netif_addr_unlock_bh(bond_dev); if (BOND_MODE(bond) == BOND_MODE_8023AD) dev_mc_add(slave_dev, lacpdu_mcast_addr); } } bond->slave_cnt++; bond_compute_features(bond); bond_set_carrier(bond); if (bond_uses_primary(bond)) { block_netpoll_tx(); bond_select_active_slave(bond); unblock_netpoll_tx(); } if (bond_mode_can_use_xmit_hash(bond)) bond_update_slave_arr(bond, NULL); if (!slave_dev->netdev_ops->ndo_bpf || !slave_dev->netdev_ops->ndo_xdp_xmit) { if (bond->xdp_prog) { SLAVE_NL_ERR(bond_dev, slave_dev, extack, "Slave does not support XDP"); res = -EOPNOTSUPP; goto err_sysfs_del; } } else if (bond->xdp_prog) { struct netdev_bpf xdp = { .command = XDP_SETUP_PROG, .flags = 0, .prog = bond->xdp_prog, .extack = extack, }; if (dev_xdp_prog_count(slave_dev) > 0) { SLAVE_NL_ERR(bond_dev, slave_dev, extack, "Slave has XDP program loaded, please unload before enslaving"); res = -EOPNOTSUPP; goto err_sysfs_del; } res = slave_dev->netdev_ops->ndo_bpf(slave_dev, &xdp); if (res < 0) { /* ndo_bpf() sets extack error message */ slave_dbg(bond_dev, slave_dev, "Error %d calling ndo_bpf\n", res); goto err_sysfs_del; } if (bond->xdp_prog) bpf_prog_inc(bond->xdp_prog); } slave_info(bond_dev, slave_dev, "Enslaving as %s interface with %s link\n", bond_is_active_slave(new_slave) ? "an active" : "a backup", new_slave->link != BOND_LINK_DOWN ? "an up" : "a down"); /* enslave is successful */ bond_queue_slave_event(new_slave); return 0; /* Undo stages on error */ err_sysfs_del: bond_sysfs_slave_del(new_slave); err_upper_unlink: bond_upper_dev_unlink(bond, new_slave); err_unregister: netdev_rx_handler_unregister(slave_dev); err_detach: vlan_vids_del_by_dev(slave_dev, bond_dev); if (rcu_access_pointer(bond->primary_slave) == new_slave) RCU_INIT_POINTER(bond->primary_slave, NULL); if (rcu_access_pointer(bond->curr_active_slave) == new_slave) { block_netpoll_tx(); bond_change_active_slave(bond, NULL); bond_select_active_slave(bond); unblock_netpoll_tx(); } /* either primary_slave or curr_active_slave might've changed */ synchronize_rcu(); slave_disable_netpoll(new_slave); err_close: if (!netif_is_bond_master(slave_dev)) slave_dev->priv_flags &= ~IFF_BONDING; dev_close(slave_dev); err_restore_mac: slave_dev->flags &= ~IFF_SLAVE; if (!bond->params.fail_over_mac || BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP) { /* XXX TODO - fom follow mode needs to change master's * MAC if this slave's MAC is in use by the bond, or at * least print a warning. 
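	 * (What is restored below is the slave's permanent MAC that was saved
	 * in new_slave->perm_hwaddr before it was overwritten with the bond's
	 * address.)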
*/ bond_hw_addr_copy(ss.__data, new_slave->perm_hwaddr, new_slave->dev->addr_len); ss.ss_family = slave_dev->type; dev_set_mac_address(slave_dev, (struct sockaddr *)&ss, NULL); } err_restore_mtu: dev_set_mtu(slave_dev, new_slave->original_mtu); err_free: kobject_put(&new_slave->kobj); err_undo_flags: /* Enslave of first slave has failed and we need to fix master's mac */ if (!bond_has_slaves(bond)) { if (ether_addr_equal_64bits(bond_dev->dev_addr, slave_dev->dev_addr)) eth_hw_addr_random(bond_dev); if (bond_dev->type != ARPHRD_ETHER) { dev_close(bond_dev); bond_ether_setup(bond_dev); } } return res; } /* Try to release the slave device <slave> from the bond device <master> * It is legal to access curr_active_slave without a lock because all the function * is RTNL-locked. If "all" is true it means that the function is being called * while destroying a bond interface and all slaves are being released. * * The rules for slave state should be: * for Active/Backup: * Active stays on all backups go down * for Bonded connections: * The first up interface should be left on and all others downed. */ static int __bond_release_one(struct net_device *bond_dev, struct net_device *slave_dev, bool all, bool unregister) { struct bonding *bond = netdev_priv(bond_dev); struct slave *slave, *oldcurrent; struct sockaddr_storage ss; int old_flags = bond_dev->flags; netdev_features_t old_features = bond_dev->features; /* slave is not a slave or master is not master of this slave */ if (!(slave_dev->flags & IFF_SLAVE) || !netdev_has_upper_dev(slave_dev, bond_dev)) { slave_dbg(bond_dev, slave_dev, "cannot release slave\n"); return -EINVAL; } block_netpoll_tx(); slave = bond_get_slave_by_dev(bond, slave_dev); if (!slave) { /* not a slave of this bond */ slave_info(bond_dev, slave_dev, "interface not enslaved\n"); unblock_netpoll_tx(); return -EINVAL; } bond_set_slave_inactive_flags(slave, BOND_SLAVE_NOTIFY_NOW); bond_sysfs_slave_del(slave); /* recompute stats just before removing the slave */ bond_get_stats(bond->dev, &bond->bond_stats); if (bond->xdp_prog) { struct netdev_bpf xdp = { .command = XDP_SETUP_PROG, .flags = 0, .prog = NULL, .extack = NULL, }; if (slave_dev->netdev_ops->ndo_bpf(slave_dev, &xdp)) slave_warn(bond_dev, slave_dev, "failed to unload XDP program\n"); } /* unregister rx_handler early so bond_handle_frame wouldn't be called * for this slave anymore. */ netdev_rx_handler_unregister(slave_dev); if (BOND_MODE(bond) == BOND_MODE_8023AD) bond_3ad_unbind_slave(slave); bond_upper_dev_unlink(bond, slave); if (bond_mode_can_use_xmit_hash(bond)) bond_update_slave_arr(bond, slave); slave_info(bond_dev, slave_dev, "Releasing %s interface\n", bond_is_active_slave(slave) ? 
"active" : "backup"); oldcurrent = rcu_access_pointer(bond->curr_active_slave); RCU_INIT_POINTER(bond->current_arp_slave, NULL); if (!all && (!bond->params.fail_over_mac || BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP)) { if (ether_addr_equal_64bits(bond_dev->dev_addr, slave->perm_hwaddr) && bond_has_slaves(bond)) slave_warn(bond_dev, slave_dev, "the permanent HWaddr of slave - %pM - is still in use by bond - set the HWaddr of slave to a different address to avoid conflicts\n", slave->perm_hwaddr); } if (rtnl_dereference(bond->primary_slave) == slave) RCU_INIT_POINTER(bond->primary_slave, NULL); if (oldcurrent == slave) bond_change_active_slave(bond, NULL); if (bond_is_lb(bond)) { /* Must be called only after the slave has been * detached from the list and the curr_active_slave * has been cleared (if our_slave == old_current), * but before a new active slave is selected. */ bond_alb_deinit_slave(bond, slave); } if (all) { RCU_INIT_POINTER(bond->curr_active_slave, NULL); } else if (oldcurrent == slave) { /* Note that we hold RTNL over this sequence, so there * is no concern that another slave add/remove event * will interfere. */ bond_select_active_slave(bond); } bond_set_carrier(bond); if (!bond_has_slaves(bond)) eth_hw_addr_random(bond_dev); unblock_netpoll_tx(); synchronize_rcu(); bond->slave_cnt--; if (!bond_has_slaves(bond)) { call_netdevice_notifiers(NETDEV_CHANGEADDR, bond->dev); call_netdevice_notifiers(NETDEV_RELEASE, bond->dev); } bond_compute_features(bond); if (!(bond_dev->features & NETIF_F_VLAN_CHALLENGED) && (old_features & NETIF_F_VLAN_CHALLENGED)) slave_info(bond_dev, slave_dev, "last VLAN challenged slave left bond - VLAN blocking is removed\n"); vlan_vids_del_by_dev(slave_dev, bond_dev); /* If the mode uses primary, then this case was handled above by * bond_change_active_slave(..., NULL) */ if (!bond_uses_primary(bond)) { /* unset promiscuity level from slave * NOTE: The NETDEV_CHANGEADDR call above may change the value * of the IFF_PROMISC flag in the bond_dev, but we need the * value of that flag before that change, as that was the value * when this slave was attached, so we cache at the start of the * function and use it here. Same goes for ALLMULTI below */ if (old_flags & IFF_PROMISC) dev_set_promiscuity(slave_dev, -1); /* unset allmulti level from slave */ if (old_flags & IFF_ALLMULTI) dev_set_allmulti(slave_dev, -1); if (old_flags & IFF_UP) bond_hw_addr_flush(bond_dev, slave_dev); } slave_disable_netpoll(slave); /* close slave before restoring its mac address */ dev_close(slave_dev); if (bond->params.fail_over_mac != BOND_FOM_ACTIVE || BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP) { /* restore original ("permanent") mac address */ bond_hw_addr_copy(ss.__data, slave->perm_hwaddr, slave->dev->addr_len); ss.ss_family = slave_dev->type; dev_set_mac_address(slave_dev, (struct sockaddr *)&ss, NULL); } if (unregister) __dev_set_mtu(slave_dev, slave->original_mtu); else dev_set_mtu(slave_dev, slave->original_mtu); if (!netif_is_bond_master(slave_dev)) slave_dev->priv_flags &= ~IFF_BONDING; kobject_put(&slave->kobj); return 0; } /* A wrapper used because of ndo_del_link */ int bond_release(struct net_device *bond_dev, struct net_device *slave_dev) { return __bond_release_one(bond_dev, slave_dev, false, false); } /* First release a slave and then destroy the bond if no more slaves are left. * Must be under rtnl_lock when this function is called. 
*/ static int bond_release_and_destroy(struct net_device *bond_dev, struct net_device *slave_dev) { struct bonding *bond = netdev_priv(bond_dev); int ret; ret = __bond_release_one(bond_dev, slave_dev, false, true); if (ret == 0 && !bond_has_slaves(bond) && bond_dev->reg_state != NETREG_UNREGISTERING) { bond_dev->priv_flags |= IFF_DISABLE_NETPOLL; netdev_info(bond_dev, "Destroying bond\n"); bond_remove_proc_entry(bond); unregister_netdevice(bond_dev); } return ret; } static void bond_info_query(struct net_device *bond_dev, struct ifbond *info) { struct bonding *bond = netdev_priv(bond_dev); bond_fill_ifbond(bond, info); } static int bond_slave_info_query(struct net_device *bond_dev, struct ifslave *info) { struct bonding *bond = netdev_priv(bond_dev); struct list_head *iter; int i = 0, res = -ENODEV; struct slave *slave; bond_for_each_slave(bond, slave, iter) { if (i++ == (int)info->slave_id) { res = 0; bond_fill_ifslave(slave, info); break; } } return res; } /*-------------------------------- Monitoring -------------------------------*/ /* called with rcu_read_lock() */ static int bond_miimon_inspect(struct bonding *bond) { bool ignore_updelay = false; int link_state, commit = 0; struct list_head *iter; struct slave *slave; if (BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP) { ignore_updelay = !rcu_dereference(bond->curr_active_slave); } else { struct bond_up_slave *usable_slaves; usable_slaves = rcu_dereference(bond->usable_slaves); if (usable_slaves && usable_slaves->count == 0) ignore_updelay = true; } bond_for_each_slave_rcu(bond, slave, iter) { bond_propose_link_state(slave, BOND_LINK_NOCHANGE); link_state = bond_check_dev_link(bond, slave->dev, 0); switch (slave->link) { case BOND_LINK_UP: if (link_state) continue; bond_propose_link_state(slave, BOND_LINK_FAIL); commit++; slave->delay = bond->params.downdelay; if (slave->delay) { slave_info(bond->dev, slave->dev, "link status down for %sinterface, disabling it in %d ms\n", (BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP) ? (bond_is_active_slave(slave) ? "active " : "backup ") : "", bond->params.downdelay * bond->params.miimon); } fallthrough; case BOND_LINK_FAIL: if (link_state) { /* recovered before downdelay expired */ bond_propose_link_state(slave, BOND_LINK_UP); slave->last_link_up = jiffies; slave_info(bond->dev, slave->dev, "link status up again after %d ms\n", (bond->params.downdelay - slave->delay) * bond->params.miimon); commit++; continue; } if (slave->delay <= 0) { bond_propose_link_state(slave, BOND_LINK_DOWN); commit++; continue; } slave->delay--; break; case BOND_LINK_DOWN: if (!link_state) continue; bond_propose_link_state(slave, BOND_LINK_BACK); commit++; slave->delay = bond->params.updelay; if (slave->delay) { slave_info(bond->dev, slave->dev, "link status up, enabling it in %d ms\n", ignore_updelay ? 
0 : bond->params.updelay * bond->params.miimon); } fallthrough; case BOND_LINK_BACK: if (!link_state) { bond_propose_link_state(slave, BOND_LINK_DOWN); slave_info(bond->dev, slave->dev, "link status down again after %d ms\n", (bond->params.updelay - slave->delay) * bond->params.miimon); commit++; continue; } if (ignore_updelay) slave->delay = 0; if (slave->delay <= 0) { bond_propose_link_state(slave, BOND_LINK_UP); commit++; ignore_updelay = false; continue; } slave->delay--; break; } } return commit; } static void bond_miimon_link_change(struct bonding *bond, struct slave *slave, char link) { switch (BOND_MODE(bond)) { case BOND_MODE_8023AD: bond_3ad_handle_link_change(slave, link); break; case BOND_MODE_TLB: case BOND_MODE_ALB: bond_alb_handle_link_change(bond, slave, link); break; case BOND_MODE_XOR: bond_update_slave_arr(bond, NULL); break; } } static void bond_miimon_commit(struct bonding *bond) { struct list_head *iter; struct slave *slave, *primary; bond_for_each_slave(bond, slave, iter) { switch (slave->link_new_state) { case BOND_LINK_NOCHANGE: /* For 802.3ad mode, check current slave speed and * duplex again in case its port was disabled after * invalid speed/duplex reporting but recovered before * link monitoring could make a decision on the actual * link status */ if (BOND_MODE(bond) == BOND_MODE_8023AD && slave->link == BOND_LINK_UP) bond_3ad_adapter_speed_duplex_changed(slave); continue; case BOND_LINK_UP: if (bond_update_speed_duplex(slave) && bond_needs_speed_duplex(bond)) { slave->link = BOND_LINK_DOWN; if (net_ratelimit()) slave_warn(bond->dev, slave->dev, "failed to get link speed/duplex\n"); continue; } bond_set_slave_link_state(slave, BOND_LINK_UP, BOND_SLAVE_NOTIFY_NOW); slave->last_link_up = jiffies; primary = rtnl_dereference(bond->primary_slave); if (BOND_MODE(bond) == BOND_MODE_8023AD) { /* prevent it from being the active one */ bond_set_backup_slave(slave); } else if (BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP) { /* make it immediately active */ bond_set_active_slave(slave); } slave_info(bond->dev, slave->dev, "link status definitely up, %u Mbps %s duplex\n", slave->speed == SPEED_UNKNOWN ? 0 : slave->speed, slave->duplex ? "full" : "half"); bond_miimon_link_change(bond, slave, BOND_LINK_UP); if (!bond->curr_active_slave || slave == primary) goto do_failover; continue; case BOND_LINK_DOWN: if (slave->link_failure_count < UINT_MAX) slave->link_failure_count++; bond_set_slave_link_state(slave, BOND_LINK_DOWN, BOND_SLAVE_NOTIFY_NOW); if (BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP || BOND_MODE(bond) == BOND_MODE_8023AD) bond_set_slave_inactive_flags(slave, BOND_SLAVE_NOTIFY_NOW); slave_info(bond->dev, slave->dev, "link status definitely down, disabling slave\n"); bond_miimon_link_change(bond, slave, BOND_LINK_DOWN); if (slave == rcu_access_pointer(bond->curr_active_slave)) goto do_failover; continue; default: slave_err(bond->dev, slave->dev, "invalid new link %d on slave\n", slave->link_new_state); bond_propose_link_state(slave, BOND_LINK_NOCHANGE); continue; } do_failover: block_netpoll_tx(); bond_select_active_slave(bond); unblock_netpoll_tx(); } bond_set_carrier(bond); } /* bond_mii_monitor * * Really a wrapper that splits the mii monitor into two phases: an * inspection, then (if inspection indicates something needs to be done) * an acquisition of appropriate locks followed by a commit phase to * implement whatever link state changes are indicated. 
*/ static void bond_mii_monitor(struct work_struct *work) { struct bonding *bond = container_of(work, struct bonding, mii_work.work); bool should_notify_peers = false; bool commit; unsigned long delay; struct slave *slave; struct list_head *iter; delay = msecs_to_jiffies(bond->params.miimon); if (!bond_has_slaves(bond)) goto re_arm; rcu_read_lock(); should_notify_peers = bond_should_notify_peers(bond); commit = !!bond_miimon_inspect(bond); if (bond->send_peer_notif) { rcu_read_unlock(); if (rtnl_trylock()) { bond->send_peer_notif--; rtnl_unlock(); } } else { rcu_read_unlock(); } if (commit) { /* Race avoidance with bond_close cancel of workqueue */ if (!rtnl_trylock()) { delay = 1; should_notify_peers = false; goto re_arm; } bond_for_each_slave(bond, slave, iter) { bond_commit_link_state(slave, BOND_SLAVE_NOTIFY_LATER); } bond_miimon_commit(bond); rtnl_unlock(); /* might sleep, hold no other locks */ } re_arm: if (bond->params.miimon) queue_delayed_work(bond->wq, &bond->mii_work, delay); if (should_notify_peers) { if (!rtnl_trylock()) return; call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, bond->dev); rtnl_unlock(); } } static int bond_upper_dev_walk(struct net_device *upper, struct netdev_nested_priv *priv) { __be32 ip = *(__be32 *)priv->data; return ip == bond_confirm_addr(upper, 0, ip); } static bool bond_has_this_ip(struct bonding *bond, __be32 ip) { struct netdev_nested_priv priv = { .data = (void *)&ip, }; bool ret = false; if (ip == bond_confirm_addr(bond->dev, 0, ip)) return true; rcu_read_lock(); if (netdev_walk_all_upper_dev_rcu(bond->dev, bond_upper_dev_walk, &priv)) ret = true; rcu_read_unlock(); return ret; } /* We go to the (large) trouble of VLAN tagging ARP frames because * switches in VLAN mode (especially if ports are configured as * "native" to a VLAN) might not pass non-tagged frames. */ static void bond_arp_send(struct slave *slave, int arp_op, __be32 dest_ip, __be32 src_ip, struct bond_vlan_tag *tags) { struct sk_buff *skb; struct bond_vlan_tag *outer_tag = tags; struct net_device *slave_dev = slave->dev; struct net_device *bond_dev = slave->bond->dev; slave_dbg(bond_dev, slave_dev, "arp %d on slave: dst %pI4 src %pI4\n", arp_op, &dest_ip, &src_ip); skb = arp_create(arp_op, ETH_P_ARP, dest_ip, slave_dev, src_ip, NULL, slave_dev->dev_addr, NULL); if (!skb) { net_err_ratelimited("ARP packet allocation failed\n"); return; } if (!tags || tags->vlan_proto == VLAN_N_VID) goto xmit; tags++; /* Go through all the tags backwards and add them to the packet */ while (tags->vlan_proto != VLAN_N_VID) { if (!tags->vlan_id) { tags++; continue; } slave_dbg(bond_dev, slave_dev, "inner tag: proto %X vid %X\n", ntohs(outer_tag->vlan_proto), tags->vlan_id); skb = vlan_insert_tag_set_proto(skb, tags->vlan_proto, tags->vlan_id); if (!skb) { net_err_ratelimited("failed to insert inner VLAN tag\n"); return; } tags++; } /* Set the outer tag */ if (outer_tag->vlan_id) { slave_dbg(bond_dev, slave_dev, "outer tag: proto %X vid %X\n", ntohs(outer_tag->vlan_proto), outer_tag->vlan_id); __vlan_hwaccel_put_tag(skb, outer_tag->vlan_proto, outer_tag->vlan_id); } xmit: arp_xmit(skb); } /* Validate the device path between the @start_dev and the @end_dev. * The path is valid if the @end_dev is reachable through device * stacking. * When the path is validated, collect any vlan information in the * path. 
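 *
 * (Worked example, added for illustration and not part of the original
 * comment: for a bond0 that carries a VLAN device bond0.100, and an
 * arp_ip_target routed via bond0.100, bond_verify_device_path(bond0,
 * bond0.100, 0) returns a two-entry array: tags[0] = { 802.1Q, vid 100 },
 * filled in while the recursion unwinds, and tags[1] acting as the
 * terminator with vlan_proto == VLAN_N_VID. bond_arp_send() then emits
 * the probe with VLAN 100 as the outer, hardware-accelerated tag.)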
 */
struct bond_vlan_tag *bond_verify_device_path(struct net_device *start_dev,
					      struct net_device *end_dev,
					      int level)
{
	struct bond_vlan_tag *tags;
	struct net_device *upper;
	struct list_head *iter;

	if (start_dev == end_dev) {
		tags = kcalloc(level + 1, sizeof(*tags), GFP_ATOMIC);
		if (!tags)
			return ERR_PTR(-ENOMEM);
		tags[level].vlan_proto = VLAN_N_VID;
		return tags;
	}

	netdev_for_each_upper_dev_rcu(start_dev, upper, iter) {
		tags = bond_verify_device_path(upper, end_dev, level + 1);
		if (IS_ERR_OR_NULL(tags)) {
			if (IS_ERR(tags))
				return tags;
			continue;
		}

		if (is_vlan_dev(upper)) {
			tags[level].vlan_proto = vlan_dev_vlan_proto(upper);
			tags[level].vlan_id = vlan_dev_vlan_id(upper);
		}

		return tags;
	}

	return NULL;
}

static void bond_arp_send_all(struct bonding *bond, struct slave *slave)
{
	struct rtable *rt;
	struct bond_vlan_tag *tags;
	__be32 *targets = bond->params.arp_targets, addr;
	int i;

	for (i = 0; i < BOND_MAX_ARP_TARGETS && targets[i]; i++) {
		slave_dbg(bond->dev, slave->dev, "%s: target %pI4\n",
			  __func__, &targets[i]);
		tags = NULL;

		/* Find out through which dev should the packet go */
		rt = ip_route_output(dev_net(bond->dev), targets[i], 0,
				     RTO_ONLINK, 0);
		if (IS_ERR(rt)) {
			/* there's no route to target - try to send arp
			 * probe to generate any traffic (arp_validate=0)
			 */
			if (bond->params.arp_validate)
				pr_warn_once("%s: no route to arp_ip_target %pI4 and arp_validate is set\n",
					     bond->dev->name,
					     &targets[i]);
			bond_arp_send(slave, ARPOP_REQUEST, targets[i],
				      0, tags);
			continue;
		}

		/* bond device itself */
		if (rt->dst.dev == bond->dev)
			goto found;

		rcu_read_lock();
		tags = bond_verify_device_path(bond->dev, rt->dst.dev, 0);
		rcu_read_unlock();

		if (!IS_ERR_OR_NULL(tags))
			goto found;

		/* Not our device - skip */
		slave_dbg(bond->dev, slave->dev, "no path to arp_ip_target %pI4 via rt.dev %s\n",
			  &targets[i], rt->dst.dev ?
rt->dst.dev->name : "NULL"); ip_rt_put(rt); continue; found: addr = bond_confirm_addr(rt->dst.dev, targets[i], 0); ip_rt_put(rt); bond_arp_send(slave, ARPOP_REQUEST, targets[i], addr, tags); kfree(tags); } } static void bond_validate_arp(struct bonding *bond, struct slave *slave, __be32 sip, __be32 tip) { int i; if (!sip || !bond_has_this_ip(bond, tip)) { slave_dbg(bond->dev, slave->dev, "%s: sip %pI4 tip %pI4 not found\n", __func__, &sip, &tip); return; } i = bond_get_targets_ip(bond->params.arp_targets, sip); if (i == -1) { slave_dbg(bond->dev, slave->dev, "%s: sip %pI4 not found in targets\n", __func__, &sip); return; } slave->last_rx = jiffies; slave->target_last_arp_rx[i] = jiffies; } int bond_arp_rcv(const struct sk_buff *skb, struct bonding *bond, struct slave *slave) { struct arphdr *arp = (struct arphdr *)skb->data; struct slave *curr_active_slave, *curr_arp_slave; unsigned char *arp_ptr; __be32 sip, tip; int is_arp = skb->protocol == __cpu_to_be16(ETH_P_ARP); unsigned int alen; if (!slave_do_arp_validate(bond, slave)) { if ((slave_do_arp_validate_only(bond) && is_arp) || !slave_do_arp_validate_only(bond)) slave->last_rx = jiffies; return RX_HANDLER_ANOTHER; } else if (!is_arp) { return RX_HANDLER_ANOTHER; } alen = arp_hdr_len(bond->dev); slave_dbg(bond->dev, slave->dev, "%s: skb->dev %s\n", __func__, skb->dev->name); if (alen > skb_headlen(skb)) { arp = kmalloc(alen, GFP_ATOMIC); if (!arp) goto out_unlock; if (skb_copy_bits(skb, 0, arp, alen) < 0) goto out_unlock; } if (arp->ar_hln != bond->dev->addr_len || skb->pkt_type == PACKET_OTHERHOST || skb->pkt_type == PACKET_LOOPBACK || arp->ar_hrd != htons(ARPHRD_ETHER) || arp->ar_pro != htons(ETH_P_IP) || arp->ar_pln != 4) goto out_unlock; arp_ptr = (unsigned char *)(arp + 1); arp_ptr += bond->dev->addr_len; memcpy(&sip, arp_ptr, 4); arp_ptr += 4 + bond->dev->addr_len; memcpy(&tip, arp_ptr, 4); slave_dbg(bond->dev, slave->dev, "%s: %s/%d av %d sv %d sip %pI4 tip %pI4\n", __func__, slave->dev->name, bond_slave_state(slave), bond->params.arp_validate, slave_do_arp_validate(bond, slave), &sip, &tip); curr_active_slave = rcu_dereference(bond->curr_active_slave); curr_arp_slave = rcu_dereference(bond->current_arp_slave); /* We 'trust' the received ARP enough to validate it if: * * (a) the slave receiving the ARP is active (which includes the * current ARP slave, if any), or * * (b) the receiving slave isn't active, but there is a currently * active slave and it received valid arp reply(s) after it became * the currently active slave, or * * (c) there is an ARP slave that sent an ARP during the prior ARP * interval, and we receive an ARP reply on any slave. We accept * these because switch FDB update delays may deliver the ARP * reply to a slave other than the sender of the ARP request. * * Note: for (b), backup slaves are receiving the broadcast ARP * request, not a reply. This request passes from the sending * slave through the L2 switch(es) to the receiving slave. Since * this is checking the request, sip/tip are swapped for * validation. * * This is done to avoid endless looping when we can't reach the * arp_ip_target and fool ourselves with our own arp requests. 
*/ if (bond_is_active_slave(slave)) bond_validate_arp(bond, slave, sip, tip); else if (curr_active_slave && time_after(slave_last_rx(bond, curr_active_slave), curr_active_slave->last_link_up)) bond_validate_arp(bond, slave, tip, sip); else if (curr_arp_slave && (arp->ar_op == htons(ARPOP_REPLY)) && bond_time_in_interval(bond, dev_trans_start(curr_arp_slave->dev), 1)) bond_validate_arp(bond, slave, sip, tip); out_unlock: if (arp != (struct arphdr *)skb->data) kfree(arp); return RX_HANDLER_ANOTHER; } /* function to verify if we're in the arp_interval timeslice, returns true if * (last_act - arp_interval) <= jiffies <= (last_act + mod * arp_interval + * arp_interval/2) . the arp_interval/2 is needed for really fast networks. */ static bool bond_time_in_interval(struct bonding *bond, unsigned long last_act, int mod) { int delta_in_ticks = msecs_to_jiffies(bond->params.arp_interval); return time_in_range(jiffies, last_act - delta_in_ticks, last_act + mod * delta_in_ticks + delta_in_ticks/2); } /* This function is called regularly to monitor each slave's link * ensuring that traffic is being sent and received when arp monitoring * is used in load-balancing mode. if the adapter has been dormant, then an * arp is transmitted to generate traffic. see activebackup_arp_monitor for * arp monitoring in active backup mode. */ static void bond_loadbalance_arp_mon(struct bonding *bond) { struct slave *slave, *oldcurrent; struct list_head *iter; int do_failover = 0, slave_state_changed = 0; if (!bond_has_slaves(bond)) goto re_arm; rcu_read_lock(); oldcurrent = rcu_dereference(bond->curr_active_slave); /* see if any of the previous devices are up now (i.e. they have * xmt and rcv traffic). the curr_active_slave does not come into * the picture unless it is null. also, slave->last_link_up is not * needed here because we send an arp on each slave and give a slave * as long as it needs to get the tx/rx within the delta. * TODO: what about up/down delay in arp mode? it wasn't here before * so it can wait */ bond_for_each_slave_rcu(bond, slave, iter) { unsigned long trans_start = dev_trans_start(slave->dev); bond_propose_link_state(slave, BOND_LINK_NOCHANGE); if (slave->link != BOND_LINK_UP) { if (bond_time_in_interval(bond, trans_start, 1) && bond_time_in_interval(bond, slave->last_rx, 1)) { bond_propose_link_state(slave, BOND_LINK_UP); slave_state_changed = 1; /* primary_slave has no meaning in round-robin * mode. the window of a slave being up and * curr_active_slave being null after enslaving * is closed. 
*/ if (!oldcurrent) { slave_info(bond->dev, slave->dev, "link status definitely up\n"); do_failover = 1; } else { slave_info(bond->dev, slave->dev, "interface is now up\n"); } } } else { /* slave->link == BOND_LINK_UP */ /* not all switches will respond to an arp request * when the source ip is 0, so don't take the link down * if we don't know our ip yet */ if (!bond_time_in_interval(bond, trans_start, bond->params.missed_max) || !bond_time_in_interval(bond, slave->last_rx, bond->params.missed_max)) { bond_propose_link_state(slave, BOND_LINK_DOWN); slave_state_changed = 1; if (slave->link_failure_count < UINT_MAX) slave->link_failure_count++; slave_info(bond->dev, slave->dev, "interface is now down\n"); if (slave == oldcurrent) do_failover = 1; } } /* note: if switch is in round-robin mode, all links * must tx arp to ensure all links rx an arp - otherwise * links may oscillate or not come up at all; if switch is * in something like xor mode, there is nothing we can * do - all replies will be rx'ed on same link causing slaves * to be unstable during low/no traffic periods */ if (bond_slave_is_up(slave)) bond_arp_send_all(bond, slave); } rcu_read_unlock(); if (do_failover || slave_state_changed) { if (!rtnl_trylock()) goto re_arm; bond_for_each_slave(bond, slave, iter) { if (slave->link_new_state != BOND_LINK_NOCHANGE) slave->link = slave->link_new_state; } if (slave_state_changed) { bond_slave_state_change(bond); if (BOND_MODE(bond) == BOND_MODE_XOR) bond_update_slave_arr(bond, NULL); } if (do_failover) { block_netpoll_tx(); bond_select_active_slave(bond); unblock_netpoll_tx(); } rtnl_unlock(); } re_arm: if (bond->params.arp_interval) queue_delayed_work(bond->wq, &bond->arp_work, msecs_to_jiffies(bond->params.arp_interval)); } /* Called to inspect slaves for active-backup mode ARP monitor link state * changes. Sets proposed link state in slaves to specify what action * should take place for the slave. Returns 0 if no changes are found, >0 * if changes to link states must be committed. * * Called with rcu_read_lock held. */ static int bond_ab_arp_inspect(struct bonding *bond) { unsigned long trans_start, last_rx; struct list_head *iter; struct slave *slave; int commit = 0; bond_for_each_slave_rcu(bond, slave, iter) { bond_propose_link_state(slave, BOND_LINK_NOCHANGE); last_rx = slave_last_rx(bond, slave); if (slave->link != BOND_LINK_UP) { if (bond_time_in_interval(bond, last_rx, 1)) { bond_propose_link_state(slave, BOND_LINK_UP); commit++; } else if (slave->link == BOND_LINK_BACK) { bond_propose_link_state(slave, BOND_LINK_FAIL); commit++; } continue; } /* Give slaves 2*delta after being enslaved or made * active. This avoids bouncing, as the last receive * times need a full ARP monitor cycle to be updated. 
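 *
 * (Illustrative note, added and not part of the original comment: with
 * arp_interval=1000 this exempts a slave for up to roughly 2.5 s after
 * last_link_up, since bond_time_in_interval(.., 2) accepts anything up
 * to last_act + 2*delta + delta/2.)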
*/ if (bond_time_in_interval(bond, slave->last_link_up, 2)) continue; /* Backup slave is down if: * - No current_arp_slave AND * - more than (missed_max+1)*delta since last receive AND * - the bond has an IP address * * Note: a non-null current_arp_slave indicates * the curr_active_slave went down and we are * searching for a new one; under this condition * we only take the curr_active_slave down - this * gives each slave a chance to tx/rx traffic * before being taken out */ if (!bond_is_active_slave(slave) && !rcu_access_pointer(bond->current_arp_slave) && !bond_time_in_interval(bond, last_rx, bond->params.missed_max + 1)) { bond_propose_link_state(slave, BOND_LINK_DOWN); commit++; } /* Active slave is down if: * - more than missed_max*delta since transmitting OR * - (more than missed_max*delta since receive AND * the bond has an IP address) */ trans_start = dev_trans_start(slave->dev); if (bond_is_active_slave(slave) && (!bond_time_in_interval(bond, trans_start, bond->params.missed_max) || !bond_time_in_interval(bond, last_rx, bond->params.missed_max))) { bond_propose_link_state(slave, BOND_LINK_DOWN); commit++; } } return commit; } /* Called to commit link state changes noted by inspection step of * active-backup mode ARP monitor. * * Called with RTNL hold. */ static void bond_ab_arp_commit(struct bonding *bond) { unsigned long trans_start; struct list_head *iter; struct slave *slave; bond_for_each_slave(bond, slave, iter) { switch (slave->link_new_state) { case BOND_LINK_NOCHANGE: continue; case BOND_LINK_UP: trans_start = dev_trans_start(slave->dev); if (rtnl_dereference(bond->curr_active_slave) != slave || (!rtnl_dereference(bond->curr_active_slave) && bond_time_in_interval(bond, trans_start, 1))) { struct slave *current_arp_slave; current_arp_slave = rtnl_dereference(bond->current_arp_slave); bond_set_slave_link_state(slave, BOND_LINK_UP, BOND_SLAVE_NOTIFY_NOW); if (current_arp_slave) { bond_set_slave_inactive_flags( current_arp_slave, BOND_SLAVE_NOTIFY_NOW); RCU_INIT_POINTER(bond->current_arp_slave, NULL); } slave_info(bond->dev, slave->dev, "link status definitely up\n"); if (!rtnl_dereference(bond->curr_active_slave) || slave == rtnl_dereference(bond->primary_slave)) goto do_failover; } continue; case BOND_LINK_DOWN: if (slave->link_failure_count < UINT_MAX) slave->link_failure_count++; bond_set_slave_link_state(slave, BOND_LINK_DOWN, BOND_SLAVE_NOTIFY_NOW); bond_set_slave_inactive_flags(slave, BOND_SLAVE_NOTIFY_NOW); slave_info(bond->dev, slave->dev, "link status definitely down, disabling slave\n"); if (slave == rtnl_dereference(bond->curr_active_slave)) { RCU_INIT_POINTER(bond->current_arp_slave, NULL); goto do_failover; } continue; case BOND_LINK_FAIL: bond_set_slave_link_state(slave, BOND_LINK_FAIL, BOND_SLAVE_NOTIFY_NOW); bond_set_slave_inactive_flags(slave, BOND_SLAVE_NOTIFY_NOW); /* A slave has just been enslaved and has become * the current active slave. */ if (rtnl_dereference(bond->curr_active_slave)) RCU_INIT_POINTER(bond->current_arp_slave, NULL); continue; default: slave_err(bond->dev, slave->dev, "impossible: link_new_state %d on slave\n", slave->link_new_state); continue; } do_failover: block_netpoll_tx(); bond_select_active_slave(bond); unblock_netpoll_tx(); } bond_set_carrier(bond); } /* Send ARP probes for active-backup mode ARP monitor. * * Called with rcu_read_lock held. 
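 *
 * (Note, added for illustration and not part of the original comment:
 * the boolean returned is "should_notify_rtnl" - true when some slave
 * still has a pending state or link notification that must be sent
 * under RTNL, which the caller then performs under rtnl_trylock().)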
*/ static bool bond_ab_arp_probe(struct bonding *bond) { struct slave *slave, *before = NULL, *new_slave = NULL, *curr_arp_slave = rcu_dereference(bond->current_arp_slave), *curr_active_slave = rcu_dereference(bond->curr_active_slave); struct list_head *iter; bool found = false; bool should_notify_rtnl = BOND_SLAVE_NOTIFY_LATER; if (curr_arp_slave && curr_active_slave) netdev_info(bond->dev, "PROBE: c_arp %s && cas %s BAD\n", curr_arp_slave->dev->name, curr_active_slave->dev->name); if (curr_active_slave) { bond_arp_send_all(bond, curr_active_slave); return should_notify_rtnl; } /* if we don't have a curr_active_slave, search for the next available * backup slave from the current_arp_slave and make it the candidate * for becoming the curr_active_slave */ if (!curr_arp_slave) { curr_arp_slave = bond_first_slave_rcu(bond); if (!curr_arp_slave) return should_notify_rtnl; } bond_for_each_slave_rcu(bond, slave, iter) { if (!found && !before && bond_slave_is_up(slave)) before = slave; if (found && !new_slave && bond_slave_is_up(slave)) new_slave = slave; /* if the link state is up at this point, we * mark it down - this can happen if we have * simultaneous link failures and * reselect_active_interface doesn't make this * one the current slave so it is still marked * up when it is actually down */ if (!bond_slave_is_up(slave) && slave->link == BOND_LINK_UP) { bond_set_slave_link_state(slave, BOND_LINK_DOWN, BOND_SLAVE_NOTIFY_LATER); if (slave->link_failure_count < UINT_MAX) slave->link_failure_count++; bond_set_slave_inactive_flags(slave, BOND_SLAVE_NOTIFY_LATER); slave_info(bond->dev, slave->dev, "backup interface is now down\n"); } if (slave == curr_arp_slave) found = true; } if (!new_slave && before) new_slave = before; if (!new_slave) goto check_state; bond_set_slave_link_state(new_slave, BOND_LINK_BACK, BOND_SLAVE_NOTIFY_LATER); bond_set_slave_active_flags(new_slave, BOND_SLAVE_NOTIFY_LATER); bond_arp_send_all(bond, new_slave); new_slave->last_link_up = jiffies; rcu_assign_pointer(bond->current_arp_slave, new_slave); check_state: bond_for_each_slave_rcu(bond, slave, iter) { if (slave->should_notify || slave->should_notify_link) { should_notify_rtnl = BOND_SLAVE_NOTIFY_NOW; break; } } return should_notify_rtnl; } static void bond_activebackup_arp_mon(struct bonding *bond) { bool should_notify_peers = false; bool should_notify_rtnl = false; int delta_in_ticks; delta_in_ticks = msecs_to_jiffies(bond->params.arp_interval); if (!bond_has_slaves(bond)) goto re_arm; rcu_read_lock(); should_notify_peers = bond_should_notify_peers(bond); if (bond_ab_arp_inspect(bond)) { rcu_read_unlock(); /* Race avoidance with bond_close flush of workqueue */ if (!rtnl_trylock()) { delta_in_ticks = 1; should_notify_peers = false; goto re_arm; } bond_ab_arp_commit(bond); rtnl_unlock(); rcu_read_lock(); } should_notify_rtnl = bond_ab_arp_probe(bond); rcu_read_unlock(); re_arm: if (bond->params.arp_interval) queue_delayed_work(bond->wq, &bond->arp_work, delta_in_ticks); if (should_notify_peers || should_notify_rtnl) { if (!rtnl_trylock()) return; if (should_notify_peers) { bond->send_peer_notif--; call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, bond->dev); } if (should_notify_rtnl) { bond_slave_state_notify(bond); bond_slave_link_notify(bond); } rtnl_unlock(); } } static void bond_arp_monitor(struct work_struct *work) { struct bonding *bond = container_of(work, struct bonding, arp_work.work); if (BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP) bond_activebackup_arp_mon(bond); else bond_loadbalance_arp_mon(bond); } 
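/* Illustrative userspace sketch, not part of the driver: the ARP monitors
 * above judge liveness with bond_time_in_interval(), i.e. they test
 * (last_act - delta) <= now <= (last_act + mod * delta + delta / 2) with
 * delta = arp_interval. The hypothetical program below (all names made up
 * for illustration) models that window with plain integers; it ignores the
 * jiffies wrap-around that the kernel's time_in_range() handles.
 */
#include <stdbool.h>
#include <stdio.h>

static bool in_arp_window(long now, long last_act, long delta, int mod)
{
	return now >= last_act - delta &&
	       now <= last_act + (long)mod * delta + delta / 2;
}

int main(void)
{
	long delta = 1000;	/* arp_interval of 1000 ms */
	long last_rx = 10000;	/* last traffic seen at t = 10 s */

	/* within one interval plus slack: the slave still looks alive */
	printf("%d\n", in_arp_window(10900, last_rx, delta, 1));	/* 1 */
	/* beyond missed_max = 2 intervals plus slack: considered down */
	printf("%d\n", in_arp_window(12600, last_rx, delta, 2));	/* 0 */
	return 0;
}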
/*-------------------------- netdev event handling --------------------------*/ /* Change device name */ static int bond_event_changename(struct bonding *bond) { bond_remove_proc_entry(bond); bond_create_proc_entry(bond); bond_debug_reregister(bond); return NOTIFY_DONE; } static int bond_master_netdev_event(unsigned long event, struct net_device *bond_dev) { struct bonding *event_bond = netdev_priv(bond_dev); netdev_dbg(bond_dev, "%s called\n", __func__); switch (event) { case NETDEV_CHANGENAME: return bond_event_changename(event_bond); case NETDEV_UNREGISTER: bond_remove_proc_entry(event_bond); #ifdef CONFIG_XFRM_OFFLOAD xfrm_dev_state_flush(dev_net(bond_dev), bond_dev, true); #endif /* CONFIG_XFRM_OFFLOAD */ break; case NETDEV_REGISTER: bond_create_proc_entry(event_bond); break; default: break; } return NOTIFY_DONE; } static int bond_slave_netdev_event(unsigned long event, struct net_device *slave_dev) { struct slave *slave = bond_slave_get_rtnl(slave_dev), *primary; struct bonding *bond; struct net_device *bond_dev; /* A netdev event can be generated while enslaving a device * before netdev_rx_handler_register is called in which case * slave will be NULL */ if (!slave) { netdev_dbg(slave_dev, "%s called on NULL slave\n", __func__); return NOTIFY_DONE; } bond_dev = slave->bond->dev; bond = slave->bond; primary = rtnl_dereference(bond->primary_slave); slave_dbg(bond_dev, slave_dev, "%s called\n", __func__); switch (event) { case NETDEV_UNREGISTER: if (bond_dev->type != ARPHRD_ETHER) bond_release_and_destroy(bond_dev, slave_dev); else __bond_release_one(bond_dev, slave_dev, false, true); break; case NETDEV_UP: case NETDEV_CHANGE: /* For 802.3ad mode only: * Getting invalid Speed/Duplex values here will put slave * in weird state. Mark it as link-fail if the link was * previously up or link-down if it hasn't yet come up, and * let link-monitoring (miimon) set it right when correct * speeds/duplex are available. */ if (bond_update_speed_duplex(slave) && BOND_MODE(bond) == BOND_MODE_8023AD) { if (slave->last_link_up) slave->link = BOND_LINK_FAIL; else slave->link = BOND_LINK_DOWN; } if (BOND_MODE(bond) == BOND_MODE_8023AD) bond_3ad_adapter_speed_duplex_changed(slave); fallthrough; case NETDEV_DOWN: /* Refresh slave-array if applicable! * If the setup does not use miimon or arpmon (mode-specific!), * then these events will not cause the slave-array to be * refreshed. This will cause xmit to use a slave that is not * usable. Avoid such situation by refeshing the array at these * events. If these (miimon/arpmon) parameters are configured * then array gets refreshed twice and that should be fine! */ if (bond_mode_can_use_xmit_hash(bond)) bond_update_slave_arr(bond, NULL); break; case NETDEV_CHANGEMTU: /* TODO: Should slaves be allowed to * independently alter their MTU? For * an active-backup bond, slaves need * not be the same type of device, so * MTUs may vary. For other modes, * slaves arguably should have the * same MTUs. To do this, we'd need to * take over the slave's change_mtu * function for the duration of their * servitude. 
*/ break; case NETDEV_CHANGENAME: /* we don't care if we don't have primary set */ if (!bond_uses_primary(bond) || !bond->params.primary[0]) break; if (slave == primary) { /* slave's name changed - he's no longer primary */ RCU_INIT_POINTER(bond->primary_slave, NULL); } else if (!strcmp(slave_dev->name, bond->params.primary)) { /* we have a new primary slave */ rcu_assign_pointer(bond->primary_slave, slave); } else { /* we didn't change primary - exit */ break; } netdev_info(bond->dev, "Primary slave changed to %s, reselecting active slave\n", primary ? slave_dev->name : "none"); block_netpoll_tx(); bond_select_active_slave(bond); unblock_netpoll_tx(); break; case NETDEV_FEAT_CHANGE: if (!bond->notifier_ctx) { bond->notifier_ctx = true; bond_compute_features(bond); bond->notifier_ctx = false; } break; case NETDEV_RESEND_IGMP: /* Propagate to master device */ call_netdevice_notifiers(event, slave->bond->dev); break; default: break; } return NOTIFY_DONE; } /* bond_netdev_event: handle netdev notifier chain events. * * This function receives events for the netdev chain. The caller (an * ioctl handler calling blocking_notifier_call_chain) holds the necessary * locks for us to safely manipulate the slave devices (RTNL lock, * dev_probe_lock). */ static int bond_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *event_dev = netdev_notifier_info_to_dev(ptr); netdev_dbg(event_dev, "%s received %s\n", __func__, netdev_cmd_to_name(event)); if (!(event_dev->priv_flags & IFF_BONDING)) return NOTIFY_DONE; if (event_dev->flags & IFF_MASTER) { int ret; ret = bond_master_netdev_event(event, event_dev); if (ret != NOTIFY_DONE) return ret; } if (event_dev->flags & IFF_SLAVE) return bond_slave_netdev_event(event, event_dev); return NOTIFY_DONE; } static struct notifier_block bond_netdev_notifier = { .notifier_call = bond_netdev_event, }; /*---------------------------- Hashing Policies -----------------------------*/ /* Helper to access data in a packet, with or without a backing skb. * If skb is given the data is linearized if necessary via pskb_may_pull. 
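 *
 * (Note, added for illustration and not part of the original comment: in
 * the XDP transmit path skb is NULL and hlen covers the whole linear
 * xdp_buff, so no pull is possible and the helper returns NULL whenever
 * the requested n bytes are not already present.)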
*/ static inline const void *bond_pull_data(struct sk_buff *skb, const void *data, int hlen, int n) { if (likely(n <= hlen)) return data; else if (skb && likely(pskb_may_pull(skb, n))) return skb->data; return NULL; } /* L2 hash helper */ static inline u32 bond_eth_hash(struct sk_buff *skb, const void *data, int mhoff, int hlen) { struct ethhdr *ep; data = bond_pull_data(skb, data, hlen, mhoff + sizeof(struct ethhdr)); if (!data) return 0; ep = (struct ethhdr *)(data + mhoff); return ep->h_dest[5] ^ ep->h_source[5] ^ be16_to_cpu(ep->h_proto); } static bool bond_flow_ip(struct sk_buff *skb, struct flow_keys *fk, const void *data, int hlen, __be16 l2_proto, int *nhoff, int *ip_proto, bool l34) { const struct ipv6hdr *iph6; const struct iphdr *iph; if (l2_proto == htons(ETH_P_IP)) { data = bond_pull_data(skb, data, hlen, *nhoff + sizeof(*iph)); if (!data) return false; iph = (const struct iphdr *)(data + *nhoff); iph_to_flow_copy_v4addrs(fk, iph); *nhoff += iph->ihl << 2; if (!ip_is_fragment(iph)) *ip_proto = iph->protocol; } else if (l2_proto == htons(ETH_P_IPV6)) { data = bond_pull_data(skb, data, hlen, *nhoff + sizeof(*iph6)); if (!data) return false; iph6 = (const struct ipv6hdr *)(data + *nhoff); iph_to_flow_copy_v6addrs(fk, iph6); *nhoff += sizeof(*iph6); *ip_proto = iph6->nexthdr; } else { return false; } if (l34 && *ip_proto >= 0) fk->ports.ports = __skb_flow_get_ports(skb, *nhoff, *ip_proto, data, hlen); return true; } static u32 bond_vlan_srcmac_hash(struct sk_buff *skb, const void *data, int mhoff, int hlen) { u32 srcmac_vendor = 0, srcmac_dev = 0; struct ethhdr *mac_hdr; u16 vlan = 0; int i; data = bond_pull_data(skb, data, hlen, mhoff + sizeof(struct ethhdr)); if (!data) return 0; mac_hdr = (struct ethhdr *)(data + mhoff); for (i = 0; i < 3; i++) srcmac_vendor = (srcmac_vendor << 8) | mac_hdr->h_source[i]; for (i = 3; i < ETH_ALEN; i++) srcmac_dev = (srcmac_dev << 8) | mac_hdr->h_source[i]; if (skb && skb_vlan_tag_present(skb)) vlan = skb_vlan_tag_get(skb); return vlan ^ srcmac_vendor ^ srcmac_dev; } /* Extract the appropriate headers based on bond's xmit policy */ static bool bond_flow_dissect(struct bonding *bond, struct sk_buff *skb, const void *data, __be16 l2_proto, int nhoff, int hlen, struct flow_keys *fk) { bool l34 = bond->params.xmit_policy == BOND_XMIT_POLICY_LAYER34; int ip_proto = -1; switch (bond->params.xmit_policy) { case BOND_XMIT_POLICY_ENCAP23: case BOND_XMIT_POLICY_ENCAP34: memset(fk, 0, sizeof(*fk)); return __skb_flow_dissect(NULL, skb, &flow_keys_bonding, fk, data, l2_proto, nhoff, hlen, 0); default: break; } fk->ports.ports = 0; memset(&fk->icmp, 0, sizeof(fk->icmp)); if (!bond_flow_ip(skb, fk, data, hlen, l2_proto, &nhoff, &ip_proto, l34)) return false; /* ICMP error packets contains at least 8 bytes of the header * of the packet which generated the error. Use this information * to correlate ICMP error packets within the same flow which * generated the error. 
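 *
 * (Illustrative example, added and not part of the original comment: an
 * ICMPv4 port-unreachable embeds the offending datagram's IP header plus
 * the first 8 bytes of its UDP header, so re-running bond_flow_ip() just
 * past the icmphdr hashes the error onto the same slave as the flow that
 * triggered it.)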
*/ if (ip_proto == IPPROTO_ICMP || ip_proto == IPPROTO_ICMPV6) { skb_flow_get_icmp_tci(skb, &fk->icmp, data, nhoff, hlen); if (ip_proto == IPPROTO_ICMP) { if (!icmp_is_err(fk->icmp.type)) return true; nhoff += sizeof(struct icmphdr); } else if (ip_proto == IPPROTO_ICMPV6) { if (!icmpv6_is_err(fk->icmp.type)) return true; nhoff += sizeof(struct icmp6hdr); } return bond_flow_ip(skb, fk, data, hlen, l2_proto, &nhoff, &ip_proto, l34); } return true; } static u32 bond_ip_hash(u32 hash, struct flow_keys *flow, int xmit_policy) { hash ^= (__force u32)flow_get_u32_dst(flow) ^ (__force u32)flow_get_u32_src(flow); hash ^= (hash >> 16); hash ^= (hash >> 8); /* discard lowest hash bit to deal with the common even ports pattern */ if (xmit_policy == BOND_XMIT_POLICY_LAYER34 || xmit_policy == BOND_XMIT_POLICY_ENCAP34) return hash >> 1; return hash; } /* Generate hash based on xmit policy. If @skb is given it is used to linearize * the data as required, but this function can be used without it if the data is * known to be linear (e.g. with xdp_buff). */ static u32 __bond_xmit_hash(struct bonding *bond, struct sk_buff *skb, const void *data, __be16 l2_proto, int mhoff, int nhoff, int hlen) { struct flow_keys flow; u32 hash; if (bond->params.xmit_policy == BOND_XMIT_POLICY_VLAN_SRCMAC) return bond_vlan_srcmac_hash(skb, data, mhoff, hlen); if (bond->params.xmit_policy == BOND_XMIT_POLICY_LAYER2 || !bond_flow_dissect(bond, skb, data, l2_proto, nhoff, hlen, &flow)) return bond_eth_hash(skb, data, mhoff, hlen); if (bond->params.xmit_policy == BOND_XMIT_POLICY_LAYER23 || bond->params.xmit_policy == BOND_XMIT_POLICY_ENCAP23) { hash = bond_eth_hash(skb, data, mhoff, hlen); } else { if (flow.icmp.id) memcpy(&hash, &flow.icmp, sizeof(hash)); else memcpy(&hash, &flow.ports.ports, sizeof(hash)); } return bond_ip_hash(hash, &flow, bond->params.xmit_policy); } /** * bond_xmit_hash - generate a hash value based on the xmit policy * @bond: bonding device * @skb: buffer to use for headers * * This function will extract the necessary headers from the skb buffer and use * them to generate a hash based on the xmit_policy set in the bonding device */ u32 bond_xmit_hash(struct bonding *bond, struct sk_buff *skb) { if (bond->params.xmit_policy == BOND_XMIT_POLICY_ENCAP34 && skb->l4_hash) return skb->hash; return __bond_xmit_hash(bond, skb, skb->data, skb->protocol, 0, skb_network_offset(skb), skb_headlen(skb)); } /** * bond_xmit_hash_xdp - generate a hash value based on the xmit policy * @bond: bonding device * @xdp: buffer to use for headers * * The XDP variant of bond_xmit_hash. 
*/ static u32 bond_xmit_hash_xdp(struct bonding *bond, struct xdp_buff *xdp) { struct ethhdr *eth; if (xdp->data + sizeof(struct ethhdr) > xdp->data_end) return 0; eth = (struct ethhdr *)xdp->data; return __bond_xmit_hash(bond, NULL, xdp->data, eth->h_proto, 0, sizeof(struct ethhdr), xdp->data_end - xdp->data); } /*-------------------------- Device entry points ----------------------------*/ void bond_work_init_all(struct bonding *bond) { INIT_DELAYED_WORK(&bond->mcast_work, bond_resend_igmp_join_requests_delayed); INIT_DELAYED_WORK(&bond->alb_work, bond_alb_monitor); INIT_DELAYED_WORK(&bond->mii_work, bond_mii_monitor); INIT_DELAYED_WORK(&bond->arp_work, bond_arp_monitor); INIT_DELAYED_WORK(&bond->ad_work, bond_3ad_state_machine_handler); INIT_DELAYED_WORK(&bond->slave_arr_work, bond_slave_arr_handler); } static void bond_work_cancel_all(struct bonding *bond) { cancel_delayed_work_sync(&bond->mii_work); cancel_delayed_work_sync(&bond->arp_work); cancel_delayed_work_sync(&bond->alb_work); cancel_delayed_work_sync(&bond->ad_work); cancel_delayed_work_sync(&bond->mcast_work); cancel_delayed_work_sync(&bond->slave_arr_work); } static int bond_open(struct net_device *bond_dev) { struct bonding *bond = netdev_priv(bond_dev); struct list_head *iter; struct slave *slave; if (BOND_MODE(bond) == BOND_MODE_ROUNDROBIN && !bond->rr_tx_counter) { bond->rr_tx_counter = alloc_percpu(u32); if (!bond->rr_tx_counter) return -ENOMEM; } /* reset slave->backup and slave->inactive */ if (bond_has_slaves(bond)) { bond_for_each_slave(bond, slave, iter) { if (bond_uses_primary(bond) && slave != rcu_access_pointer(bond->curr_active_slave)) { bond_set_slave_inactive_flags(slave, BOND_SLAVE_NOTIFY_NOW); } else if (BOND_MODE(bond) != BOND_MODE_8023AD) { bond_set_slave_active_flags(slave, BOND_SLAVE_NOTIFY_NOW); } } } if (bond_is_lb(bond)) { /* bond_alb_initialize must be called before the timer * is started. */ if (bond_alb_initialize(bond, (BOND_MODE(bond) == BOND_MODE_ALB))) return -ENOMEM; if (bond->params.tlb_dynamic_lb || BOND_MODE(bond) == BOND_MODE_ALB) queue_delayed_work(bond->wq, &bond->alb_work, 0); } if (bond->params.miimon) /* link check interval, in milliseconds. */ queue_delayed_work(bond->wq, &bond->mii_work, 0); if (bond->params.arp_interval) { /* arp interval, in milliseconds. */ queue_delayed_work(bond->wq, &bond->arp_work, 0); bond->recv_probe = bond_arp_rcv; } if (BOND_MODE(bond) == BOND_MODE_8023AD) { queue_delayed_work(bond->wq, &bond->ad_work, 0); /* register to receive LACPDUs */ bond->recv_probe = bond_3ad_lacpdu_recv; bond_3ad_initiate_agg_selection(bond, 1); bond_for_each_slave(bond, slave, iter) dev_mc_add(slave->dev, lacpdu_mcast_addr); } if (bond_mode_can_use_xmit_hash(bond)) bond_update_slave_arr(bond, NULL); return 0; } static int bond_close(struct net_device *bond_dev) { struct bonding *bond = netdev_priv(bond_dev); struct slave *slave; bond_work_cancel_all(bond); bond->send_peer_notif = 0; if (bond_is_lb(bond)) bond_alb_deinitialize(bond); bond->recv_probe = NULL; if (bond_uses_primary(bond)) { rcu_read_lock(); slave = rcu_dereference(bond->curr_active_slave); if (slave) bond_hw_addr_flush(bond_dev, slave->dev); rcu_read_unlock(); } else { struct list_head *iter; bond_for_each_slave(bond, slave, iter) bond_hw_addr_flush(bond_dev, slave->dev); } return 0; } /* fold stats, assuming all rtnl_link_stats64 fields are u64, but * that some drivers can provide 32bit values only. 
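 *
 * (Worked example, added and not part of the original comment: a driver
 * keeping 32-bit counters may wrap from 0xfffffff0 to 0x00000010. The raw
 * 64-bit delta 0x10 - 0xfffffff0 is negative and would be discarded by the
 * "delta > 0" filter, but because ((nv | ov) >> 32) == 0 the delta is
 * recomputed as (s64)(s32)((u32)0x10 - (u32)0xfffffff0) == 0x20, the true
 * increment.)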
*/ static void bond_fold_stats(struct rtnl_link_stats64 *_res, const struct rtnl_link_stats64 *_new, const struct rtnl_link_stats64 *_old) { const u64 *new = (const u64 *)_new; const u64 *old = (const u64 *)_old; u64 *res = (u64 *)_res; int i; for (i = 0; i < sizeof(*_res) / sizeof(u64); i++) { u64 nv = new[i]; u64 ov = old[i]; s64 delta = nv - ov; /* detects if this particular field is 32bit only */ if (((nv | ov) >> 32) == 0) delta = (s64)(s32)((u32)nv - (u32)ov); /* filter anomalies, some drivers reset their stats * at down/up events. */ if (delta > 0) res[i] += delta; } } #ifdef CONFIG_LOCKDEP static int bond_get_lowest_level_rcu(struct net_device *dev) { struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1]; struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1]; int cur = 0, max = 0; now = dev; iter = &dev->adj_list.lower; while (1) { next = NULL; while (1) { ldev = netdev_next_lower_dev_rcu(now, &iter); if (!ldev) break; next = ldev; niter = &ldev->adj_list.lower; dev_stack[cur] = now; iter_stack[cur++] = iter; if (max <= cur) max = cur; break; } if (!next) { if (!cur) return max; next = dev_stack[--cur]; niter = iter_stack[cur]; } now = next; iter = niter; } return max; } #endif static void bond_get_stats(struct net_device *bond_dev, struct rtnl_link_stats64 *stats) { struct bonding *bond = netdev_priv(bond_dev); struct rtnl_link_stats64 temp; struct list_head *iter; struct slave *slave; int nest_level = 0; rcu_read_lock(); #ifdef CONFIG_LOCKDEP nest_level = bond_get_lowest_level_rcu(bond_dev); #endif spin_lock_nested(&bond->stats_lock, nest_level); memcpy(stats, &bond->bond_stats, sizeof(*stats)); bond_for_each_slave_rcu(bond, slave, iter) { const struct rtnl_link_stats64 *new = dev_get_stats(slave->dev, &temp); bond_fold_stats(stats, new, &slave->slave_stats); /* save off the slave stats for the next run */ memcpy(&slave->slave_stats, new, sizeof(*new)); } memcpy(&bond->bond_stats, stats, sizeof(*stats)); spin_unlock(&bond->stats_lock); rcu_read_unlock(); } static int bond_eth_ioctl(struct net_device *bond_dev, struct ifreq *ifr, int cmd) { struct bonding *bond = netdev_priv(bond_dev); struct mii_ioctl_data *mii = NULL; int res; netdev_dbg(bond_dev, "bond_eth_ioctl: cmd=%d\n", cmd); switch (cmd) { case SIOCGMIIPHY: mii = if_mii(ifr); if (!mii) return -EINVAL; mii->phy_id = 0; fallthrough; case SIOCGMIIREG: /* We do this again just in case we were called by SIOCGMIIREG * instead of SIOCGMIIPHY. 
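 *
 * (Illustrative note, added and not part of the original comment: this
 * lets MII-aware userspace tools such as mii-tool query the bond itself;
 * reading register 1 returns BMSR_LSTATUS in val_out whenever the bond
 * has carrier, so the aggregate link can be checked like a physical NIC.)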
*/ mii = if_mii(ifr); if (!mii) return -EINVAL; if (mii->reg_num == 1) { mii->val_out = 0; if (netif_carrier_ok(bond->dev)) mii->val_out = BMSR_LSTATUS; } return 0; default: res = -EOPNOTSUPP; } return res; } static int bond_do_ioctl(struct net_device *bond_dev, struct ifreq *ifr, int cmd) { struct bonding *bond = netdev_priv(bond_dev); struct net_device *slave_dev = NULL; struct ifbond k_binfo; struct ifbond __user *u_binfo = NULL; struct ifslave k_sinfo; struct ifslave __user *u_sinfo = NULL; struct bond_opt_value newval; struct net *net; int res = 0; netdev_dbg(bond_dev, "bond_ioctl: cmd=%d\n", cmd); switch (cmd) { case SIOCBONDINFOQUERY: u_binfo = (struct ifbond __user *)ifr->ifr_data; if (copy_from_user(&k_binfo, u_binfo, sizeof(ifbond))) return -EFAULT; bond_info_query(bond_dev, &k_binfo); if (copy_to_user(u_binfo, &k_binfo, sizeof(ifbond))) return -EFAULT; return 0; case SIOCBONDSLAVEINFOQUERY: u_sinfo = (struct ifslave __user *)ifr->ifr_data; if (copy_from_user(&k_sinfo, u_sinfo, sizeof(ifslave))) return -EFAULT; res = bond_slave_info_query(bond_dev, &k_sinfo); if (res == 0 && copy_to_user(u_sinfo, &k_sinfo, sizeof(ifslave))) return -EFAULT; return res; default: break; } net = dev_net(bond_dev); if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; slave_dev = __dev_get_by_name(net, ifr->ifr_slave); slave_dbg(bond_dev, slave_dev, "slave_dev=%p:\n", slave_dev); if (!slave_dev) return -ENODEV; switch (cmd) { case SIOCBONDENSLAVE: res = bond_enslave(bond_dev, slave_dev, NULL); break; case SIOCBONDRELEASE: res = bond_release(bond_dev, slave_dev); break; case SIOCBONDSETHWADDR: res = bond_set_dev_addr(bond_dev, slave_dev); break; case SIOCBONDCHANGEACTIVE: bond_opt_initstr(&newval, slave_dev->name); res = __bond_opt_set_notify(bond, BOND_OPT_ACTIVE_SLAVE, &newval); break; default: res = -EOPNOTSUPP; } return res; } static int bond_siocdevprivate(struct net_device *bond_dev, struct ifreq *ifr, void __user *data, int cmd) { struct ifreq ifrdata = { .ifr_data = data }; switch (cmd) { case BOND_INFO_QUERY_OLD: return bond_do_ioctl(bond_dev, &ifrdata, SIOCBONDINFOQUERY); case BOND_SLAVE_INFO_QUERY_OLD: return bond_do_ioctl(bond_dev, &ifrdata, SIOCBONDSLAVEINFOQUERY); case BOND_ENSLAVE_OLD: return bond_do_ioctl(bond_dev, ifr, SIOCBONDENSLAVE); case BOND_RELEASE_OLD: return bond_do_ioctl(bond_dev, ifr, SIOCBONDRELEASE); case BOND_SETHWADDR_OLD: return bond_do_ioctl(bond_dev, ifr, SIOCBONDSETHWADDR); case BOND_CHANGE_ACTIVE_OLD: return bond_do_ioctl(bond_dev, ifr, SIOCBONDCHANGEACTIVE); } return -EOPNOTSUPP; } static void bond_change_rx_flags(struct net_device *bond_dev, int change) { struct bonding *bond = netdev_priv(bond_dev); if (change & IFF_PROMISC) bond_set_promiscuity(bond, bond_dev->flags & IFF_PROMISC ? 1 : -1); if (change & IFF_ALLMULTI) bond_set_allmulti(bond, bond_dev->flags & IFF_ALLMULTI ? 
1 : -1); } static void bond_set_rx_mode(struct net_device *bond_dev) { struct bonding *bond = netdev_priv(bond_dev); struct list_head *iter; struct slave *slave; rcu_read_lock(); if (bond_uses_primary(bond)) { slave = rcu_dereference(bond->curr_active_slave); if (slave) { dev_uc_sync(slave->dev, bond_dev); dev_mc_sync(slave->dev, bond_dev); } } else { bond_for_each_slave_rcu(bond, slave, iter) { dev_uc_sync_multiple(slave->dev, bond_dev); dev_mc_sync_multiple(slave->dev, bond_dev); } } rcu_read_unlock(); } static int bond_neigh_init(struct neighbour *n) { struct bonding *bond = netdev_priv(n->dev); const struct net_device_ops *slave_ops; struct neigh_parms parms; struct slave *slave; int ret = 0; rcu_read_lock(); slave = bond_first_slave_rcu(bond); if (!slave) goto out; slave_ops = slave->dev->netdev_ops; if (!slave_ops->ndo_neigh_setup) goto out; /* TODO: find another way [1] to implement this. * Passing a zeroed structure is fragile, * but at least we do not pass garbage. * * [1] One way would be that ndo_neigh_setup() never touch * struct neigh_parms, but propagate the new neigh_setup() * back to ___neigh_create() / neigh_parms_alloc() */ memset(&parms, 0, sizeof(parms)); ret = slave_ops->ndo_neigh_setup(slave->dev, &parms); if (ret) goto out; if (parms.neigh_setup) ret = parms.neigh_setup(n); out: rcu_read_unlock(); return ret; } /* The bonding ndo_neigh_setup is called at init time beofre any * slave exists. So we must declare proxy setup function which will * be used at run time to resolve the actual slave neigh param setup. * * It's also called by master devices (such as vlans) to setup their * underlying devices. In that case - do nothing, we're already set up from * our init. */ static int bond_neigh_setup(struct net_device *dev, struct neigh_parms *parms) { /* modify only our neigh_parms */ if (parms->dev == dev) parms->neigh_setup = bond_neigh_init; return 0; } /* Change the MTU of all of a master's slaves to match the master */ static int bond_change_mtu(struct net_device *bond_dev, int new_mtu) { struct bonding *bond = netdev_priv(bond_dev); struct slave *slave, *rollback_slave; struct list_head *iter; int res = 0; netdev_dbg(bond_dev, "bond=%p, new_mtu=%d\n", bond, new_mtu); bond_for_each_slave(bond, slave, iter) { slave_dbg(bond_dev, slave->dev, "s %p c_m %p\n", slave, slave->dev->netdev_ops->ndo_change_mtu); res = dev_set_mtu(slave->dev, new_mtu); if (res) { /* If we failed to set the slave's mtu to the new value * we must abort the operation even in ACTIVE_BACKUP * mode, because if we allow the backup slaves to have * different mtu values than the active slave we'll * need to change their mtu when doing a failover. That * means changing their mtu from timer context, which * is probably not a good idea. */ slave_dbg(bond_dev, slave->dev, "err %d setting mtu to %d\n", res, new_mtu); goto unwind; } } bond_dev->mtu = new_mtu; return 0; unwind: /* unwind from head to the slave that failed */ bond_for_each_slave(bond, rollback_slave, iter) { int tmp_res; if (rollback_slave == slave) break; tmp_res = dev_set_mtu(rollback_slave->dev, bond_dev->mtu); if (tmp_res) slave_dbg(bond_dev, rollback_slave->dev, "unwind err %d\n", tmp_res); } return res; } /* Change HW address * * Note that many devices must be down to change the HW address, and * downing the master releases all slaves. We can make bonds full of * bonding devices to test this, however. 
*/ static int bond_set_mac_address(struct net_device *bond_dev, void *addr) { struct bonding *bond = netdev_priv(bond_dev); struct slave *slave, *rollback_slave; struct sockaddr_storage *ss = addr, tmp_ss; struct list_head *iter; int res = 0; if (BOND_MODE(bond) == BOND_MODE_ALB) return bond_alb_set_mac_address(bond_dev, addr); netdev_dbg(bond_dev, "%s: bond=%p\n", __func__, bond); /* If fail_over_mac is enabled, do nothing and return success. * Returning an error causes ifenslave to fail. */ if (bond->params.fail_over_mac && BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP) return 0; if (!is_valid_ether_addr(ss->__data)) return -EADDRNOTAVAIL; bond_for_each_slave(bond, slave, iter) { slave_dbg(bond_dev, slave->dev, "%s: slave=%p\n", __func__, slave); res = dev_set_mac_address(slave->dev, addr, NULL); if (res) { /* TODO: consider downing the slave * and retry ? * User should expect communications * breakage anyway until ARP finish * updating, so... */ slave_dbg(bond_dev, slave->dev, "%s: err %d\n", __func__, res); goto unwind; } } /* success */ memcpy(bond_dev->dev_addr, ss->__data, bond_dev->addr_len); return 0; unwind: memcpy(tmp_ss.__data, bond_dev->dev_addr, bond_dev->addr_len); tmp_ss.ss_family = bond_dev->type; /* unwind from head to the slave that failed */ bond_for_each_slave(bond, rollback_slave, iter) { int tmp_res; if (rollback_slave == slave) break; tmp_res = dev_set_mac_address(rollback_slave->dev, (struct sockaddr *)&tmp_ss, NULL); if (tmp_res) { slave_dbg(bond_dev, rollback_slave->dev, "%s: unwind err %d\n", __func__, tmp_res); } } return res; } /** * bond_get_slave_by_id - get xmit slave with slave_id * @bond: bonding device that is transmitting * @slave_id: slave id up to slave_cnt-1 through which to transmit * * This function tries to get slave with slave_id but in case * it fails, it tries to find the first available slave for transmission. */ static struct slave *bond_get_slave_by_id(struct bonding *bond, int slave_id) { struct list_head *iter; struct slave *slave; int i = slave_id; /* Here we start from the slave with slave_id */ bond_for_each_slave_rcu(bond, slave, iter) { if (--i < 0) { if (bond_slave_can_tx(slave)) return slave; } } /* Here we start from the first slave up to slave_id */ i = slave_id; bond_for_each_slave_rcu(bond, slave, iter) { if (--i < 0) break; if (bond_slave_can_tx(slave)) return slave; } /* no slave that can tx has been found */ return NULL; } /** * bond_rr_gen_slave_id - generate slave id based on packets_per_slave * @bond: bonding device to use * * Based on the value of the bonding device's packets_per_slave parameter * this function generates a slave id, which is usually used as the next * slave to transmit through. */ static u32 bond_rr_gen_slave_id(struct bonding *bond) { u32 slave_id; struct reciprocal_value reciprocal_packets_per_slave; int packets_per_slave = bond->params.packets_per_slave; switch (packets_per_slave) { case 0: slave_id = prandom_u32(); break; case 1: slave_id = this_cpu_inc_return(*bond->rr_tx_counter); break; default: reciprocal_packets_per_slave = bond->params.reciprocal_packets_per_slave; slave_id = this_cpu_inc_return(*bond->rr_tx_counter); slave_id = reciprocal_divide(slave_id, reciprocal_packets_per_slave); break; } return slave_id; } static struct slave *bond_xmit_roundrobin_slave_get(struct bonding *bond, struct sk_buff *skb) { struct slave *slave; int slave_cnt; u32 slave_id; /* Start with the curr_active_slave that joined the bond as the * default for sending IGMP traffic. 
For failover purposes one * needs to maintain some consistency for the interface that will * send the join/membership reports. The curr_active_slave found * will send all of this type of traffic. */ if (skb->protocol == htons(ETH_P_IP)) { int noff = skb_network_offset(skb); struct iphdr *iph; if (unlikely(!pskb_may_pull(skb, noff + sizeof(*iph)))) goto non_igmp; iph = ip_hdr(skb); if (iph->protocol == IPPROTO_IGMP) { slave = rcu_dereference(bond->curr_active_slave); if (slave) return slave; return bond_get_slave_by_id(bond, 0); } } non_igmp: slave_cnt = READ_ONCE(bond->slave_cnt); if (likely(slave_cnt)) { slave_id = bond_rr_gen_slave_id(bond) % slave_cnt; return bond_get_slave_by_id(bond, slave_id); } return NULL; } static struct slave *bond_xdp_xmit_roundrobin_slave_get(struct bonding *bond, struct xdp_buff *xdp) { struct slave *slave; int slave_cnt; u32 slave_id; const struct ethhdr *eth; void *data = xdp->data; if (data + sizeof(struct ethhdr) > xdp->data_end) goto non_igmp; eth = (struct ethhdr *)data; data += sizeof(struct ethhdr); /* See comment on IGMP in bond_xmit_roundrobin_slave_get() */ if (eth->h_proto == htons(ETH_P_IP)) { const struct iphdr *iph; if (data + sizeof(struct iphdr) > xdp->data_end) goto non_igmp; iph = (struct iphdr *)data; if (iph->protocol == IPPROTO_IGMP) { slave = rcu_dereference(bond->curr_active_slave); if (slave) return slave; return bond_get_slave_by_id(bond, 0); } } non_igmp: slave_cnt = READ_ONCE(bond->slave_cnt); if (likely(slave_cnt)) { slave_id = bond_rr_gen_slave_id(bond) % slave_cnt; return bond_get_slave_by_id(bond, slave_id); } return NULL; } static netdev_tx_t bond_xmit_roundrobin(struct sk_buff *skb, struct net_device *bond_dev) { struct bonding *bond = netdev_priv(bond_dev); struct slave *slave; slave = bond_xmit_roundrobin_slave_get(bond, skb); if (likely(slave)) return bond_dev_queue_xmit(bond, skb, slave->dev); return bond_tx_drop(bond_dev, skb); } static struct slave *bond_xmit_activebackup_slave_get(struct bonding *bond) { return rcu_dereference(bond->curr_active_slave); } /* In active-backup mode, we know that bond->curr_active_slave is always valid if * the bond has a usable interface. */ static netdev_tx_t bond_xmit_activebackup(struct sk_buff *skb, struct net_device *bond_dev) { struct bonding *bond = netdev_priv(bond_dev); struct slave *slave; slave = bond_xmit_activebackup_slave_get(bond); if (slave) return bond_dev_queue_xmit(bond, skb, slave->dev); return bond_tx_drop(bond_dev, skb); } /* Use this to update slave_array when (a) it's not appropriate to update * slave_array right away (note that update_slave_array() may sleep) * and / or (b) RTNL is not held. */ void bond_slave_arr_work_rearm(struct bonding *bond, unsigned long delay) { queue_delayed_work(bond->wq, &bond->slave_arr_work, delay); } /* Slave array work handler. Holds only RTNL */ static void bond_slave_arr_handler(struct work_struct *work) { struct bonding *bond = container_of(work, struct bonding, slave_arr_work.work); int ret; if (!rtnl_trylock()) goto err; ret = bond_update_slave_arr(bond, NULL); rtnl_unlock(); if (ret) { pr_warn_ratelimited("Failed to update slave array from WT\n"); goto err; } return; err: bond_slave_arr_work_rearm(bond, 1); } static void bond_skip_slave(struct bond_up_slave *slaves, struct slave *skipslave) { int idx; /* Rare situation where caller has asked to skip a specific * slave but allocation failed (most likely!). BTW this is * only possible when the call is initiated from * __bond_release_one(). 
In this situation; overwrite the * skipslave entry in the array with the last entry from the * array to avoid a situation where the xmit path may choose * this to-be-skipped slave to send a packet out. */ for (idx = 0; slaves && idx < slaves->count; idx++) { if (skipslave == slaves->arr[idx]) { slaves->arr[idx] = slaves->arr[slaves->count - 1]; slaves->count--; break; } } } static void bond_set_slave_arr(struct bonding *bond, struct bond_up_slave *usable_slaves, struct bond_up_slave *all_slaves) { struct bond_up_slave *usable, *all; usable = rtnl_dereference(bond->usable_slaves); rcu_assign_pointer(bond->usable_slaves, usable_slaves); kfree_rcu(usable, rcu); all = rtnl_dereference(bond->all_slaves); rcu_assign_pointer(bond->all_slaves, all_slaves); kfree_rcu(all, rcu); } static void bond_reset_slave_arr(struct bonding *bond) { struct bond_up_slave *usable, *all; usable = rtnl_dereference(bond->usable_slaves); if (usable) { RCU_INIT_POINTER(bond->usable_slaves, NULL); kfree_rcu(usable, rcu); } all = rtnl_dereference(bond->all_slaves); if (all) { RCU_INIT_POINTER(bond->all_slaves, NULL); kfree_rcu(all, rcu); } } /* Build the usable slaves array in control path for modes that use xmit-hash * to determine the slave interface - * (a) BOND_MODE_8023AD * (b) BOND_MODE_XOR * (c) (BOND_MODE_TLB || BOND_MODE_ALB) && tlb_dynamic_lb == 0 * * The caller is expected to hold RTNL only and NO other lock! */ int bond_update_slave_arr(struct bonding *bond, struct slave *skipslave) { struct bond_up_slave *usable_slaves = NULL, *all_slaves = NULL; struct slave *slave; struct list_head *iter; int agg_id = 0; int ret = 0; might_sleep(); usable_slaves = kzalloc(struct_size(usable_slaves, arr, bond->slave_cnt), GFP_KERNEL); all_slaves = kzalloc(struct_size(all_slaves, arr, bond->slave_cnt), GFP_KERNEL); if (!usable_slaves || !all_slaves) { ret = -ENOMEM; goto out; } if (BOND_MODE(bond) == BOND_MODE_8023AD) { struct ad_info ad_info; spin_lock_bh(&bond->mode_lock); if (bond_3ad_get_active_agg_info(bond, &ad_info)) { spin_unlock_bh(&bond->mode_lock); pr_debug("bond_3ad_get_active_agg_info failed\n"); /* No active aggragator means it's not safe to use * the previous array. */ bond_reset_slave_arr(bond); goto out; } spin_unlock_bh(&bond->mode_lock); agg_id = ad_info.aggregator_id; } bond_for_each_slave(bond, slave, iter) { if (skipslave == slave) continue; all_slaves->arr[all_slaves->count++] = slave; if (BOND_MODE(bond) == BOND_MODE_8023AD) { struct aggregator *agg; agg = SLAVE_AD_INFO(slave)->port.aggregator; if (!agg || agg->aggregator_identifier != agg_id) continue; } if (!bond_slave_can_tx(slave)) continue; slave_dbg(bond->dev, slave->dev, "Adding slave to tx hash array[%d]\n", usable_slaves->count); usable_slaves->arr[usable_slaves->count++] = slave; } bond_set_slave_arr(bond, usable_slaves, all_slaves); return ret; out: if (ret != 0 && skipslave) { bond_skip_slave(rtnl_dereference(bond->all_slaves), skipslave); bond_skip_slave(rtnl_dereference(bond->usable_slaves), skipslave); } kfree_rcu(all_slaves, rcu); kfree_rcu(usable_slaves, rcu); return ret; } static struct slave *bond_xmit_3ad_xor_slave_get(struct bonding *bond, struct sk_buff *skb, struct bond_up_slave *slaves) { struct slave *slave; unsigned int count; u32 hash; hash = bond_xmit_hash(bond, skb); count = slaves ? 
READ_ONCE(slaves->count) : 0; if (unlikely(!count)) return NULL; slave = slaves->arr[hash % count]; return slave; } static struct slave *bond_xdp_xmit_3ad_xor_slave_get(struct bonding *bond, struct xdp_buff *xdp) { struct bond_up_slave *slaves; unsigned int count; u32 hash; hash = bond_xmit_hash_xdp(bond, xdp); slaves = rcu_dereference(bond->usable_slaves); count = slaves ? READ_ONCE(slaves->count) : 0; if (unlikely(!count)) return NULL; return slaves->arr[hash % count]; } /* Use this Xmit function for 3AD as well as XOR modes. The current * usable slave array is formed in the control path. The xmit function * just calculates hash and sends the packet out. */ static netdev_tx_t bond_3ad_xor_xmit(struct sk_buff *skb, struct net_device *dev) { struct bonding *bond = netdev_priv(dev); struct bond_up_slave *slaves; struct slave *slave; slaves = rcu_dereference(bond->usable_slaves); slave = bond_xmit_3ad_xor_slave_get(bond, skb, slaves); if (likely(slave)) return bond_dev_queue_xmit(bond, skb, slave->dev); return bond_tx_drop(dev, skb); } /* in broadcast mode, we send everything to all usable interfaces. */ static netdev_tx_t bond_xmit_broadcast(struct sk_buff *skb, struct net_device *bond_dev) { struct bonding *bond = netdev_priv(bond_dev); struct slave *slave = NULL; struct list_head *iter; bool xmit_suc = false; bool skb_used = false; bond_for_each_slave_rcu(bond, slave, iter) { struct sk_buff *skb2; if (!(bond_slave_is_up(slave) && slave->link == BOND_LINK_UP)) continue; if (bond_is_last_slave(bond, slave)) { skb2 = skb; skb_used = true; } else { skb2 = skb_clone(skb, GFP_ATOMIC); if (!skb2) { net_err_ratelimited("%s: Error: %s: skb_clone() failed\n", bond_dev->name, __func__); continue; } } if (bond_dev_queue_xmit(bond, skb2, slave->dev) == NETDEV_TX_OK) xmit_suc = true; } if (!skb_used) dev_kfree_skb_any(skb); if (xmit_suc) return NETDEV_TX_OK; atomic_long_inc(&bond_dev->tx_dropped); return NET_XMIT_DROP; } /*------------------------- Device initialization ---------------------------*/ /* Lookup the slave that corresponds to a qid */ static inline int bond_slave_override(struct bonding *bond, struct sk_buff *skb) { struct slave *slave = NULL; struct list_head *iter; if (!skb_rx_queue_recorded(skb)) return 1; /* Find out if any slaves have the same mapping as this skb. */ bond_for_each_slave_rcu(bond, slave, iter) { if (slave->queue_id == skb_get_queue_mapping(skb)) { if (bond_slave_is_up(slave) && slave->link == BOND_LINK_UP) { bond_dev_queue_xmit(bond, skb, slave->dev); return 0; } /* If the slave isn't UP, use default transmit policy. */ break; } } return 1; } static u16 bond_select_queue(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev) { /* This helper function exists to help dev_pick_tx get the correct * destination queue. Using a helper function skips a call to * skb_tx_hash and will put the skbs in the queue we expect on their * way down to the bonding driver. */ u16 txq = skb_rx_queue_recorded(skb) ? 
skb_get_rx_queue(skb) : 0; /* Save the original txq to restore before passing to the driver */ qdisc_skb_cb(skb)->slave_dev_queue_mapping = skb_get_queue_mapping(skb); if (unlikely(txq >= dev->real_num_tx_queues)) { do { txq -= dev->real_num_tx_queues; } while (txq >= dev->real_num_tx_queues); } return txq; } static struct net_device *bond_xmit_get_slave(struct net_device *master_dev, struct sk_buff *skb, bool all_slaves) { struct bonding *bond = netdev_priv(master_dev); struct bond_up_slave *slaves; struct slave *slave = NULL; switch (BOND_MODE(bond)) { case BOND_MODE_ROUNDROBIN: slave = bond_xmit_roundrobin_slave_get(bond, skb); break; case BOND_MODE_ACTIVEBACKUP: slave = bond_xmit_activebackup_slave_get(bond); break; case BOND_MODE_8023AD: case BOND_MODE_XOR: if (all_slaves) slaves = rcu_dereference(bond->all_slaves); else slaves = rcu_dereference(bond->usable_slaves); slave = bond_xmit_3ad_xor_slave_get(bond, skb, slaves); break; case BOND_MODE_BROADCAST: break; case BOND_MODE_ALB: slave = bond_xmit_alb_slave_get(bond, skb); break; case BOND_MODE_TLB: slave = bond_xmit_tlb_slave_get(bond, skb); break; default: /* Should never happen, mode already checked */ WARN_ONCE(true, "Unknown bonding mode"); break; } if (slave) return slave->dev; return NULL; } static void bond_sk_to_flow(struct sock *sk, struct flow_keys *flow) { switch (sk->sk_family) { #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: if (sk->sk_ipv6only || ipv6_addr_type(&sk->sk_v6_daddr) != IPV6_ADDR_MAPPED) { flow->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; flow->addrs.v6addrs.src = inet6_sk(sk)->saddr; flow->addrs.v6addrs.dst = sk->sk_v6_daddr; break; } fallthrough; #endif default: /* AF_INET */ flow->control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; flow->addrs.v4addrs.src = inet_sk(sk)->inet_rcv_saddr; flow->addrs.v4addrs.dst = inet_sk(sk)->inet_daddr; break; } flow->ports.src = inet_sk(sk)->inet_sport; flow->ports.dst = inet_sk(sk)->inet_dport; } /** * bond_sk_hash_l34 - generate a hash value based on the socket's L3 and L4 fields * @sk: socket to use for headers * * This function will extract the necessary field from the socket and use * them to generate a hash based on the LAYER34 xmit_policy. * Assumes that sk is a TCP or UDP socket. */ static u32 bond_sk_hash_l34(struct sock *sk) { struct flow_keys flow; u32 hash; bond_sk_to_flow(sk, &flow); /* L4 */ memcpy(&hash, &flow.ports.ports, sizeof(hash)); /* L3 */ return bond_ip_hash(hash, &flow, BOND_XMIT_POLICY_LAYER34); } static struct net_device *__bond_sk_get_lower_dev(struct bonding *bond, struct sock *sk) { struct bond_up_slave *slaves; struct slave *slave; unsigned int count; u32 hash; slaves = rcu_dereference(bond->usable_slaves); count = slaves ? 
READ_ONCE(slaves->count) : 0; if (unlikely(!count)) return NULL; hash = bond_sk_hash_l34(sk); slave = slaves->arr[hash % count]; return slave->dev; } static struct net_device *bond_sk_get_lower_dev(struct net_device *dev, struct sock *sk) { struct bonding *bond = netdev_priv(dev); struct net_device *lower = NULL; rcu_read_lock(); if (bond_sk_check(bond)) lower = __bond_sk_get_lower_dev(bond, sk); rcu_read_unlock(); return lower; } #if IS_ENABLED(CONFIG_TLS_DEVICE) static netdev_tx_t bond_tls_device_xmit(struct bonding *bond, struct sk_buff *skb, struct net_device *dev) { if (likely(bond_get_slave_by_dev(bond, tls_get_ctx(skb->sk)->netdev))) return bond_dev_queue_xmit(bond, skb, tls_get_ctx(skb->sk)->netdev); return bond_tx_drop(dev, skb); } #endif static netdev_tx_t __bond_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct bonding *bond = netdev_priv(dev); if (bond_should_override_tx_queue(bond) && !bond_slave_override(bond, skb)) return NETDEV_TX_OK; #if IS_ENABLED(CONFIG_TLS_DEVICE) if (skb->sk && tls_is_sk_tx_device_offloaded(skb->sk)) return bond_tls_device_xmit(bond, skb, dev); #endif switch (BOND_MODE(bond)) { case BOND_MODE_ROUNDROBIN: return bond_xmit_roundrobin(skb, dev); case BOND_MODE_ACTIVEBACKUP: return bond_xmit_activebackup(skb, dev); case BOND_MODE_8023AD: case BOND_MODE_XOR: return bond_3ad_xor_xmit(skb, dev); case BOND_MODE_BROADCAST: return bond_xmit_broadcast(skb, dev); case BOND_MODE_ALB: return bond_alb_xmit(skb, dev); case BOND_MODE_TLB: return bond_tlb_xmit(skb, dev); default: /* Should never happen, mode already checked */ netdev_err(dev, "Unknown bonding mode %d\n", BOND_MODE(bond)); WARN_ON_ONCE(1); return bond_tx_drop(dev, skb); } } static netdev_tx_t bond_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct bonding *bond = netdev_priv(dev); netdev_tx_t ret = NETDEV_TX_OK; /* If we risk deadlock from transmitting this in the * netpoll path, tell netpoll to queue the frame for later tx */ if (unlikely(is_netpoll_tx_blocked(dev))) return NETDEV_TX_BUSY; rcu_read_lock(); if (bond_has_slaves(bond)) ret = __bond_start_xmit(skb, dev); else ret = bond_tx_drop(dev, skb); rcu_read_unlock(); return ret; } static struct net_device * bond_xdp_get_xmit_slave(struct net_device *bond_dev, struct xdp_buff *xdp) { struct bonding *bond = netdev_priv(bond_dev); struct slave *slave; /* Caller needs to hold rcu_read_lock() */ switch (BOND_MODE(bond)) { case BOND_MODE_ROUNDROBIN: slave = bond_xdp_xmit_roundrobin_slave_get(bond, xdp); break; case BOND_MODE_ACTIVEBACKUP: slave = bond_xmit_activebackup_slave_get(bond); break; case BOND_MODE_8023AD: case BOND_MODE_XOR: slave = bond_xdp_xmit_3ad_xor_slave_get(bond, xdp); break; default: if (net_ratelimit()) netdev_err(bond_dev, "Unknown bonding mode %d for xdp xmit\n", BOND_MODE(bond)); return NULL; } if (slave) return slave->dev; return NULL; } static int bond_xdp_xmit(struct net_device *bond_dev, int n, struct xdp_frame **frames, u32 flags) { int nxmit, err = -ENXIO; rcu_read_lock(); for (nxmit = 0; nxmit < n; nxmit++) { struct xdp_frame *frame = frames[nxmit]; struct xdp_frame *frames1[] = {frame}; struct net_device *slave_dev; struct xdp_buff xdp; xdp_convert_frame_to_buff(frame, &xdp); slave_dev = bond_xdp_get_xmit_slave(bond_dev, &xdp); if (!slave_dev) { err = -ENXIO; break; } err = slave_dev->netdev_ops->ndo_xdp_xmit(slave_dev, 1, frames1, flags); if (err < 1) break; } rcu_read_unlock(); /* If error happened on the first frame then we can pass the error up, otherwise * report the number of frames that were 
xmitted. */ if (err < 0) return (nxmit == 0 ? err : nxmit); return nxmit; } static int bond_xdp_set(struct net_device *dev, struct bpf_prog *prog, struct netlink_ext_ack *extack) { struct bonding *bond = netdev_priv(dev); struct list_head *iter; struct slave *slave, *rollback_slave; struct bpf_prog *old_prog; struct netdev_bpf xdp = { .command = XDP_SETUP_PROG, .flags = 0, .prog = prog, .extack = extack, }; int err; ASSERT_RTNL(); if (!bond_xdp_check(bond)) return -EOPNOTSUPP; old_prog = bond->xdp_prog; bond->xdp_prog = prog; bond_for_each_slave(bond, slave, iter) { struct net_device *slave_dev = slave->dev; if (!slave_dev->netdev_ops->ndo_bpf || !slave_dev->netdev_ops->ndo_xdp_xmit) { SLAVE_NL_ERR(dev, slave_dev, extack, "Slave device does not support XDP"); err = -EOPNOTSUPP; goto err; } if (dev_xdp_prog_count(slave_dev) > 0) { SLAVE_NL_ERR(dev, slave_dev, extack, "Slave has XDP program loaded, please unload before enslaving"); err = -EOPNOTSUPP; goto err; } err = slave_dev->netdev_ops->ndo_bpf(slave_dev, &xdp); if (err < 0) { /* ndo_bpf() sets extack error message */ slave_err(dev, slave_dev, "Error %d calling ndo_bpf\n", err); goto err; } if (prog) bpf_prog_inc(prog); } if (prog) { static_branch_inc(&bpf_master_redirect_enabled_key); } else if (old_prog) { bpf_prog_put(old_prog); static_branch_dec(&bpf_master_redirect_enabled_key); } return 0; err: /* unwind the program changes */ bond->xdp_prog = old_prog; xdp.prog = old_prog; xdp.extack = NULL; /* do not overwrite original error */ bond_for_each_slave(bond, rollback_slave, iter) { struct net_device *slave_dev = rollback_slave->dev; int err_unwind; if (slave == rollback_slave) break; err_unwind = slave_dev->netdev_ops->ndo_bpf(slave_dev, &xdp); if (err_unwind < 0) slave_err(dev, slave_dev, "Error %d when unwinding XDP program change\n", err_unwind); else if (xdp.prog) bpf_prog_inc(xdp.prog); } return err; } static int bond_xdp(struct net_device *dev, struct netdev_bpf *xdp) { switch (xdp->command) { case XDP_SETUP_PROG: return bond_xdp_set(dev, xdp->prog, xdp->extack); default: return -EINVAL; } } static u32 bond_mode_bcast_speed(struct slave *slave, u32 speed) { if (speed == 0 || speed == SPEED_UNKNOWN) speed = slave->speed; else speed = min(speed, slave->speed); return speed; } static int bond_ethtool_get_link_ksettings(struct net_device *bond_dev, struct ethtool_link_ksettings *cmd) { struct bonding *bond = netdev_priv(bond_dev); struct list_head *iter; struct slave *slave; u32 speed = 0; cmd->base.duplex = DUPLEX_UNKNOWN; cmd->base.port = PORT_OTHER; /* Since bond_slave_can_tx returns false for all inactive or down slaves, we * do not need to check mode. Though link speed might not represent * the true receive or transmit bandwidth (not all modes are symmetric) * this is an accurate maximum. */ bond_for_each_slave(bond, slave, iter) { if (bond_slave_can_tx(slave)) { if (slave->speed != SPEED_UNKNOWN) { if (BOND_MODE(bond) == BOND_MODE_BROADCAST) speed = bond_mode_bcast_speed(slave, speed); else speed += slave->speed; } if (cmd->base.duplex == DUPLEX_UNKNOWN && slave->duplex != DUPLEX_UNKNOWN) cmd->base.duplex = slave->duplex; } } cmd->base.speed = speed ? 
: SPEED_UNKNOWN; return 0; } static void bond_ethtool_get_drvinfo(struct net_device *bond_dev, struct ethtool_drvinfo *drvinfo) { strlcpy(drvinfo->driver, DRV_NAME, sizeof(drvinfo->driver)); snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version), "%d", BOND_ABI_VERSION); } static const struct ethtool_ops bond_ethtool_ops = { .get_drvinfo = bond_ethtool_get_drvinfo, .get_link = ethtool_op_get_link, .get_link_ksettings = bond_ethtool_get_link_ksettings, }; static const struct net_device_ops bond_netdev_ops = { .ndo_init = bond_init, .ndo_uninit = bond_uninit, .ndo_open = bond_open, .ndo_stop = bond_close, .ndo_start_xmit = bond_start_xmit, .ndo_select_queue = bond_select_queue, .ndo_get_stats64 = bond_get_stats, .ndo_eth_ioctl = bond_eth_ioctl, .ndo_siocbond = bond_do_ioctl, .ndo_siocdevprivate = bond_siocdevprivate, .ndo_change_rx_flags = bond_change_rx_flags, .ndo_set_rx_mode = bond_set_rx_mode, .ndo_change_mtu = bond_change_mtu, .ndo_set_mac_address = bond_set_mac_address, .ndo_neigh_setup = bond_neigh_setup, .ndo_vlan_rx_add_vid = bond_vlan_rx_add_vid, .ndo_vlan_rx_kill_vid = bond_vlan_rx_kill_vid, #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_netpoll_setup = bond_netpoll_setup, .ndo_netpoll_cleanup = bond_netpoll_cleanup, .ndo_poll_controller = bond_poll_controller, #endif .ndo_add_slave = bond_enslave, .ndo_del_slave = bond_release, .ndo_fix_features = bond_fix_features, .ndo_features_check = passthru_features_check, .ndo_get_xmit_slave = bond_xmit_get_slave, .ndo_sk_get_lower_dev = bond_sk_get_lower_dev, .ndo_bpf = bond_xdp, .ndo_xdp_xmit = bond_xdp_xmit, .ndo_xdp_get_xmit_slave = bond_xdp_get_xmit_slave, }; static const struct device_type bond_type = { .name = "bond", }; static void bond_destructor(struct net_device *bond_dev) { struct bonding *bond = netdev_priv(bond_dev); if (bond->wq) destroy_workqueue(bond->wq); if (bond->rr_tx_counter) free_percpu(bond->rr_tx_counter); } void bond_setup(struct net_device *bond_dev) { struct bonding *bond = netdev_priv(bond_dev); spin_lock_init(&bond->mode_lock); bond->params = bonding_defaults; /* Initialize pointers */ bond->dev = bond_dev; /* Initialize the device entry points */ ether_setup(bond_dev); bond_dev->max_mtu = ETH_MAX_MTU; bond_dev->netdev_ops = &bond_netdev_ops; bond_dev->ethtool_ops = &bond_ethtool_ops; bond_dev->needs_free_netdev = true; bond_dev->priv_destructor = bond_destructor; SET_NETDEV_DEVTYPE(bond_dev, &bond_type); /* Initialize the device options */ bond_dev->flags |= IFF_MASTER; bond_dev->priv_flags |= IFF_BONDING | IFF_UNICAST_FLT | IFF_NO_QUEUE; bond_dev->priv_flags &= ~(IFF_XMIT_DST_RELEASE | IFF_TX_SKB_SHARING); #ifdef CONFIG_XFRM_OFFLOAD /* set up xfrm device ops (only supported in active-backup right now) */ bond_dev->xfrmdev_ops = &bond_xfrmdev_ops; INIT_LIST_HEAD(&bond->ipsec_list); spin_lock_init(&bond->ipsec_lock); #endif /* CONFIG_XFRM_OFFLOAD */ /* don't acquire bond device's netif_tx_lock when transmitting */ bond_dev->features |= NETIF_F_LLTX; /* By default, we declare the bond to be fully * VLAN hardware accelerated capable. Special * care is taken in the various xmit functions * when there are slaves that are not hw accel * capable */ /* Don't allow bond devices to change network namespaces. 
*/ bond_dev->features |= NETIF_F_NETNS_LOCAL; bond_dev->hw_features = BOND_VLAN_FEATURES | NETIF_F_HW_VLAN_CTAG_RX | NETIF_F_HW_VLAN_CTAG_FILTER | NETIF_F_HW_VLAN_STAG_RX | NETIF_F_HW_VLAN_STAG_FILTER; bond_dev->hw_features |= NETIF_F_GSO_ENCAP_ALL; bond_dev->features |= bond_dev->hw_features; bond_dev->features |= NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX; #ifdef CONFIG_XFRM_OFFLOAD bond_dev->hw_features |= BOND_XFRM_FEATURES; /* Only enable XFRM features if this is an active-backup config */ if (BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP) bond_dev->features |= BOND_XFRM_FEATURES; #endif /* CONFIG_XFRM_OFFLOAD */ #if IS_ENABLED(CONFIG_TLS_DEVICE) if (bond_sk_check(bond)) bond_dev->features |= BOND_TLS_FEATURES; #endif } /* Destroy a bonding device. * Must be under rtnl_lock when this function is called. */ static void bond_uninit(struct net_device *bond_dev) { struct bonding *bond = netdev_priv(bond_dev); struct bond_up_slave *usable, *all; struct list_head *iter; struct slave *slave; bond_netpoll_cleanup(bond_dev); /* Release the bonded slaves */ bond_for_each_slave(bond, slave, iter) __bond_release_one(bond_dev, slave->dev, true, true); netdev_info(bond_dev, "Released all slaves\n"); usable = rtnl_dereference(bond->usable_slaves); if (usable) { RCU_INIT_POINTER(bond->usable_slaves, NULL); kfree_rcu(usable, rcu); } all = rtnl_dereference(bond->all_slaves); if (all) { RCU_INIT_POINTER(bond->all_slaves, NULL); kfree_rcu(all, rcu); } list_del(&bond->bond_list); bond_debug_unregister(bond); } /*------------------------- Module initialization ---------------------------*/ static int bond_check_params(struct bond_params *params) { int arp_validate_value, fail_over_mac_value, primary_reselect_value, i; struct bond_opt_value newval; const struct bond_opt_value *valptr; int arp_all_targets_value = 0; u16 ad_actor_sys_prio = 0; u16 ad_user_port_key = 0; __be32 arp_target[BOND_MAX_ARP_TARGETS] = { 0 }; int arp_ip_count; int bond_mode = BOND_MODE_ROUNDROBIN; int xmit_hashtype = BOND_XMIT_POLICY_LAYER2; int lacp_fast = 0; int tlb_dynamic_lb; /* Convert string parameters. 
*/ if (mode) { bond_opt_initstr(&newval, mode); valptr = bond_opt_parse(bond_opt_get(BOND_OPT_MODE), &newval); if (!valptr) { pr_err("Error: Invalid bonding mode \"%s\"\n", mode); return -EINVAL; } bond_mode = valptr->value; } if (xmit_hash_policy) { if (bond_mode == BOND_MODE_ROUNDROBIN || bond_mode == BOND_MODE_ACTIVEBACKUP || bond_mode == BOND_MODE_BROADCAST) { pr_info("xmit_hash_policy param is irrelevant in mode %s\n", bond_mode_name(bond_mode)); } else { bond_opt_initstr(&newval, xmit_hash_policy); valptr = bond_opt_parse(bond_opt_get(BOND_OPT_XMIT_HASH), &newval); if (!valptr) { pr_err("Error: Invalid xmit_hash_policy \"%s\"\n", xmit_hash_policy); return -EINVAL; } xmit_hashtype = valptr->value; } } if (lacp_rate) { if (bond_mode != BOND_MODE_8023AD) { pr_info("lacp_rate param is irrelevant in mode %s\n", bond_mode_name(bond_mode)); } else { bond_opt_initstr(&newval, lacp_rate); valptr = bond_opt_parse(bond_opt_get(BOND_OPT_LACP_RATE), &newval); if (!valptr) { pr_err("Error: Invalid lacp rate \"%s\"\n", lacp_rate); return -EINVAL; } lacp_fast = valptr->value; } } if (ad_select) { bond_opt_initstr(&newval, ad_select); valptr = bond_opt_parse(bond_opt_get(BOND_OPT_AD_SELECT), &newval); if (!valptr) { pr_err("Error: Invalid ad_select \"%s\"\n", ad_select); return -EINVAL; } params->ad_select = valptr->value; if (bond_mode != BOND_MODE_8023AD) pr_warn("ad_select param only affects 802.3ad mode\n"); } else { params->ad_select = BOND_AD_STABLE; } if (max_bonds < 0) { pr_warn("Warning: max_bonds (%d) not in range %d-%d, so it was reset to BOND_DEFAULT_MAX_BONDS (%d)\n", max_bonds, 0, INT_MAX, BOND_DEFAULT_MAX_BONDS); max_bonds = BOND_DEFAULT_MAX_BONDS; } if (miimon < 0) { pr_warn("Warning: miimon module parameter (%d), not in range 0-%d, so it was reset to 0\n", miimon, INT_MAX); miimon = 0; } if (updelay < 0) { pr_warn("Warning: updelay module parameter (%d), not in range 0-%d, so it was reset to 0\n", updelay, INT_MAX); updelay = 0; } if (downdelay < 0) { pr_warn("Warning: downdelay module parameter (%d), not in range 0-%d, so it was reset to 0\n", downdelay, INT_MAX); downdelay = 0; } if ((use_carrier != 0) && (use_carrier != 1)) { pr_warn("Warning: use_carrier module parameter (%d), not of valid value (0/1), so it was set to 1\n", use_carrier); use_carrier = 1; } if (num_peer_notif < 0 || num_peer_notif > 255) { pr_warn("Warning: num_grat_arp/num_unsol_na (%d) not in range 0-255 so it was reset to 1\n", num_peer_notif); num_peer_notif = 1; } /* reset values for 802.3ad/TLB/ALB */ if (!bond_mode_uses_arp(bond_mode)) { if (!miimon) { pr_warn("Warning: miimon must be specified, otherwise bonding will not detect link failure, speed and duplex which are essential for 802.3ad operation\n"); pr_warn("Forcing miimon to 100msec\n"); miimon = BOND_DEFAULT_MIIMON; } } if (tx_queues < 1 || tx_queues > 255) { pr_warn("Warning: tx_queues (%d) should be between 1 and 255, resetting to %d\n", tx_queues, BOND_DEFAULT_TX_QUEUES); tx_queues = BOND_DEFAULT_TX_QUEUES; } if ((all_slaves_active != 0) && (all_slaves_active != 1)) { pr_warn("Warning: all_slaves_active module parameter (%d), not of valid value (0/1), so it was set to 0\n", all_slaves_active); all_slaves_active = 0; } if (resend_igmp < 0 || resend_igmp > 255) { pr_warn("Warning: resend_igmp (%d) should be between 0 and 255, resetting to %d\n", resend_igmp, BOND_DEFAULT_RESEND_IGMP); resend_igmp = BOND_DEFAULT_RESEND_IGMP; } bond_opt_initval(&newval, packets_per_slave); if (!bond_opt_parse(bond_opt_get(BOND_OPT_PACKETS_PER_SLAVE), &newval)) { 
pr_warn("Warning: packets_per_slave (%d) should be between 0 and %u resetting to 1\n", packets_per_slave, USHRT_MAX); packets_per_slave = 1; } if (bond_mode == BOND_MODE_ALB) { pr_notice("In ALB mode you might experience client disconnections upon reconnection of a link if the bonding module updelay parameter (%d msec) is incompatible with the forwarding delay time of the switch\n", updelay); } if (!miimon) { if (updelay || downdelay) { /* just warn the user the up/down delay will have * no effect since miimon is zero... */ pr_warn("Warning: miimon module parameter not set and updelay (%d) or downdelay (%d) module parameter is set; updelay and downdelay have no effect unless miimon is set\n", updelay, downdelay); } } else { /* don't allow arp monitoring */ if (arp_interval) { pr_warn("Warning: miimon (%d) and arp_interval (%d) can't be used simultaneously, disabling ARP monitoring\n", miimon, arp_interval); arp_interval = 0; } if ((updelay % miimon) != 0) { pr_warn("Warning: updelay (%d) is not a multiple of miimon (%d), updelay rounded to %d ms\n", updelay, miimon, (updelay / miimon) * miimon); } updelay /= miimon; if ((downdelay % miimon) != 0) { pr_warn("Warning: downdelay (%d) is not a multiple of miimon (%d), downdelay rounded to %d ms\n", downdelay, miimon, (downdelay / miimon) * miimon); } downdelay /= miimon; } if (arp_interval < 0) { pr_warn("Warning: arp_interval module parameter (%d), not in range 0-%d, so it was reset to 0\n", arp_interval, INT_MAX); arp_interval = 0; } for (arp_ip_count = 0, i = 0; (arp_ip_count < BOND_MAX_ARP_TARGETS) && arp_ip_target[i]; i++) { __be32 ip; /* not a complete check, but good enough to catch mistakes */ if (!in4_pton(arp_ip_target[i], -1, (u8 *)&ip, -1, NULL) || !bond_is_ip_target_ok(ip)) { pr_warn("Warning: bad arp_ip_target module parameter (%s), ARP monitoring will not be performed\n", arp_ip_target[i]); arp_interval = 0; } else { if (bond_get_targets_ip(arp_target, ip) == -1) arp_target[arp_ip_count++] = ip; else pr_warn("Warning: duplicate address %pI4 in arp_ip_target, skipping\n", &ip); } } if (arp_interval && !arp_ip_count) { /* don't allow arping if no arp_ip_target given... 
*/ pr_warn("Warning: arp_interval module parameter (%d) specified without providing an arp_ip_target parameter, arp_interval was reset to 0\n", arp_interval); arp_interval = 0; } if (arp_validate) { if (!arp_interval) { pr_err("arp_validate requires arp_interval\n"); return -EINVAL; } bond_opt_initstr(&newval, arp_validate); valptr = bond_opt_parse(bond_opt_get(BOND_OPT_ARP_VALIDATE), &newval); if (!valptr) { pr_err("Error: invalid arp_validate \"%s\"\n", arp_validate); return -EINVAL; } arp_validate_value = valptr->value; } else { arp_validate_value = 0; } if (arp_all_targets) { bond_opt_initstr(&newval, arp_all_targets); valptr = bond_opt_parse(bond_opt_get(BOND_OPT_ARP_ALL_TARGETS), &newval); if (!valptr) { pr_err("Error: invalid arp_all_targets_value \"%s\"\n", arp_all_targets); arp_all_targets_value = 0; } else { arp_all_targets_value = valptr->value; } } if (miimon) { pr_info("MII link monitoring set to %d ms\n", miimon); } else if (arp_interval) { valptr = bond_opt_get_val(BOND_OPT_ARP_VALIDATE, arp_validate_value); pr_info("ARP monitoring set to %d ms, validate %s, with %d target(s):", arp_interval, valptr->string, arp_ip_count); for (i = 0; i < arp_ip_count; i++) pr_cont(" %s", arp_ip_target[i]); pr_cont("\n"); } else if (max_bonds) { /* miimon and arp_interval not set, we need one so things * work as expected, see bonding.txt for details */ pr_debug("Warning: either miimon or arp_interval and arp_ip_target module parameters must be specified, otherwise bonding will not detect link failures! see bonding.txt for details\n"); } if (primary && !bond_mode_uses_primary(bond_mode)) { /* currently, using a primary only makes sense * in active backup, TLB or ALB modes */ pr_warn("Warning: %s primary device specified but has no effect in %s mode\n", primary, bond_mode_name(bond_mode)); primary = NULL; } if (primary && primary_reselect) { bond_opt_initstr(&newval, primary_reselect); valptr = bond_opt_parse(bond_opt_get(BOND_OPT_PRIMARY_RESELECT), &newval); if (!valptr) { pr_err("Error: Invalid primary_reselect \"%s\"\n", primary_reselect); return -EINVAL; } primary_reselect_value = valptr->value; } else { primary_reselect_value = BOND_PRI_RESELECT_ALWAYS; } if (fail_over_mac) { bond_opt_initstr(&newval, fail_over_mac); valptr = bond_opt_parse(bond_opt_get(BOND_OPT_FAIL_OVER_MAC), &newval); if (!valptr) { pr_err("Error: invalid fail_over_mac \"%s\"\n", fail_over_mac); return -EINVAL; } fail_over_mac_value = valptr->value; if (bond_mode != BOND_MODE_ACTIVEBACKUP) pr_warn("Warning: fail_over_mac only affects active-backup mode\n"); } else { fail_over_mac_value = BOND_FOM_NONE; } bond_opt_initstr(&newval, "default"); valptr = bond_opt_parse( bond_opt_get(BOND_OPT_AD_ACTOR_SYS_PRIO), &newval); if (!valptr) { pr_err("Error: No ad_actor_sys_prio default value"); return -EINVAL; } ad_actor_sys_prio = valptr->value; valptr = bond_opt_parse(bond_opt_get(BOND_OPT_AD_USER_PORT_KEY), &newval); if (!valptr) { pr_err("Error: No ad_user_port_key default value"); return -EINVAL; } ad_user_port_key = valptr->value; bond_opt_initstr(&newval, "default"); valptr = bond_opt_parse(bond_opt_get(BOND_OPT_TLB_DYNAMIC_LB), &newval); if (!valptr) { pr_err("Error: No tlb_dynamic_lb default value"); return -EINVAL; } tlb_dynamic_lb = valptr->value; if (lp_interval == 0) { pr_warn("Warning: ip_interval must be between 1 and %d, so it was reset to %d\n", INT_MAX, BOND_ALB_DEFAULT_LP_INTERVAL); lp_interval = BOND_ALB_DEFAULT_LP_INTERVAL; } /* fill params struct with the proper values */ params->mode = bond_mode; 
params->xmit_policy = xmit_hashtype; params->miimon = miimon; params->num_peer_notif = num_peer_notif; params->arp_interval = arp_interval; params->arp_validate = arp_validate_value; params->arp_all_targets = arp_all_targets_value; params->missed_max = 2; params->updelay = updelay; params->downdelay = downdelay; params->peer_notif_delay = 0; params->use_carrier = use_carrier; params->lacp_active = 1; params->lacp_fast = lacp_fast; params->primary[0] = 0; params->primary_reselect = primary_reselect_value; params->fail_over_mac = fail_over_mac_value; params->tx_queues = tx_queues; params->all_slaves_active = all_slaves_active; params->resend_igmp = resend_igmp; params->min_links = min_links; params->lp_interval = lp_interval; params->packets_per_slave = packets_per_slave; params->tlb_dynamic_lb = tlb_dynamic_lb; params->ad_actor_sys_prio = ad_actor_sys_prio; eth_zero_addr(params->ad_actor_system); params->ad_user_port_key = ad_user_port_key; if (packets_per_slave > 0) { params->reciprocal_packets_per_slave = reciprocal_value(packets_per_slave); } else { /* reciprocal_packets_per_slave is unused if * packets_per_slave is 0 or 1, just initialize it */ params->reciprocal_packets_per_slave = (struct reciprocal_value) { 0 }; } if (primary) strscpy_pad(params->primary, primary, sizeof(params->primary)); memcpy(params->arp_targets, arp_target, sizeof(arp_target)); return 0; } /* Called from registration process */ static int bond_init(struct net_device *bond_dev) { struct bonding *bond = netdev_priv(bond_dev); struct bond_net *bn = net_generic(dev_net(bond_dev), bond_net_id); netdev_dbg(bond_dev, "Begin bond_init\n"); bond->wq = alloc_ordered_workqueue(bond_dev->name, WQ_MEM_RECLAIM); if (!bond->wq) return -ENOMEM; bond->notifier_ctx = false; spin_lock_init(&bond->stats_lock); netdev_lockdep_set_classes(bond_dev); list_add_tail(&bond->bond_list, &bn->dev_list); bond_prepare_sysfs_group(bond); bond_debug_register(bond); /* Ensure valid dev_addr */ if (is_zero_ether_addr(bond_dev->dev_addr) && bond_dev->addr_assign_type == NET_ADDR_PERM) eth_hw_addr_random(bond_dev); return 0; } unsigned int bond_get_num_tx_queues(void) { return tx_queues; } /* Create a new bond based on the specified name and bonding parameters. * If name is NULL, obtain a suitable "bond%d" name for us. * Caller must NOT hold rtnl_lock; we need to release it here before we * set up our sysfs entries. */ int bond_create(struct net *net, const char *name) { struct net_device *bond_dev; struct bonding *bond; struct alb_bond_info *bond_info; int res; rtnl_lock(); bond_dev = alloc_netdev_mq(sizeof(struct bonding), name ? name : "bond%d", NET_NAME_UNKNOWN, bond_setup, tx_queues); if (!bond_dev) { pr_err("%s: eek! can't alloc netdev!\n", name); rtnl_unlock(); return -ENOMEM; } /* * Initialize rx_hashtbl_used_head to RLB_NULL_INDEX. * It is set to 0 by default which is wrong. 
*/ bond = netdev_priv(bond_dev); bond_info = &(BOND_ALB_INFO(bond)); bond_info->rx_hashtbl_used_head = RLB_NULL_INDEX; dev_net_set(bond_dev, net); bond_dev->rtnl_link_ops = &bond_link_ops; res = register_netdevice(bond_dev); if (res < 0) { free_netdev(bond_dev); rtnl_unlock(); return res; } netif_carrier_off(bond_dev); bond_work_init_all(bond); rtnl_unlock(); return 0; } static int __net_init bond_net_init(struct net *net) { struct bond_net *bn = net_generic(net, bond_net_id); bn->net = net; INIT_LIST_HEAD(&bn->dev_list); bond_create_proc_dir(bn); bond_create_sysfs(bn); return 0; } static void __net_exit bond_net_exit(struct net *net) { struct bond_net *bn = net_generic(net, bond_net_id); struct bonding *bond, *tmp_bond; LIST_HEAD(list); bond_destroy_sysfs(bn); /* Kill off any bonds created after unregistering bond rtnl ops */ rtnl_lock(); list_for_each_entry_safe(bond, tmp_bond, &bn->dev_list, bond_list) unregister_netdevice_queue(bond->dev, &list); unregister_netdevice_many(&list); rtnl_unlock(); bond_destroy_proc_dir(bn); } static struct pernet_operations bond_net_ops = { .init = bond_net_init, .exit = bond_net_exit, .id = &bond_net_id, .size = sizeof(struct bond_net), }; static int __init bonding_init(void) { int i; int res; res = bond_check_params(&bonding_defaults); if (res) goto out; res = register_pernet_subsys(&bond_net_ops); if (res) goto out; res = bond_netlink_init(); if (res) goto err_link; bond_create_debugfs(); for (i = 0; i < max_bonds; i++) { res = bond_create(&init_net, NULL); if (res) goto err; } skb_flow_dissector_init(&flow_keys_bonding, flow_keys_bonding_keys, ARRAY_SIZE(flow_keys_bonding_keys)); register_netdevice_notifier(&bond_netdev_notifier); out: return res; err: bond_destroy_debugfs(); bond_netlink_fini(); err_link: unregister_pernet_subsys(&bond_net_ops); goto out; } static void __exit bonding_exit(void) { unregister_netdevice_notifier(&bond_netdev_notifier); bond_destroy_debugfs(); bond_netlink_fini(); unregister_pernet_subsys(&bond_net_ops); #ifdef CONFIG_NET_POLL_CONTROLLER /* Make sure we don't have an imbalance on our netpoll blocking */ WARN_ON(atomic_read(&netpoll_block_tx)); #endif } module_init(bonding_init); module_exit(bonding_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION(DRV_DESCRIPTION); MODULE_AUTHOR("Thomas Davis, tadavis@lbl.gov and many others");
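/*
 * A minimal userspace sketch of the transmit-slave selection shown above, for
 * illustration only. bond_rr_gen_slave_id() derives an id from a per-CPU
 * packet counter (a pseudorandom id when packets_per_slave is 0, and a
 * reciprocal_divide() of the counter otherwise), and bond_get_slave_by_id()
 * falls back to the next tx-capable slave, wrapping around the list. The
 * model_slave type, the can_tx flag and the plain division below are
 * illustrative stand-ins, not the driver's implementation.
 */
#include <stdbool.h>
#include <stdio.h>

struct model_slave {
        const char *name;
        bool can_tx;            /* stands in for bond_slave_can_tx() */
};

/* Pick the slave at position slave_id, or the next tx-capable one, wrapping. */
static struct model_slave *model_get_slave_by_id(struct model_slave *slaves,
                                                 int count, int slave_id)
{
        for (int i = 0; i < count; i++) {
                struct model_slave *s = &slaves[(slave_id + i) % count];

                if (s->can_tx)
                        return s;
        }
        return NULL;            /* no slave that can tx has been found */
}

/* Simplified id generation: move to the next slave every packets_per_slave packets. */
static unsigned int model_rr_gen_slave_id(unsigned long tx_counter,
                                          unsigned int packets_per_slave)
{
        return packets_per_slave ? tx_counter / packets_per_slave : tx_counter;
}

int main(void)
{
        struct model_slave slaves[] = {
                { "eth0", true }, { "eth1", false }, { "eth2", true },
        };
        int count = 3;

        for (unsigned long counter = 0; counter < 8; counter++) {
                unsigned int id = model_rr_gen_slave_id(counter, 2) % count;
                struct model_slave *s = model_get_slave_by_id(slaves, count, id);

                printf("packet %lu -> %s\n", counter, s ? s->name : "drop");
        }
        return 0;
}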
// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2013 Politecnico di Torino, Italy * TORSEC group -- https://security.polito.it * * Author: Roberto Sassu <roberto.sassu@polito.it> * * File: ima_template.c * Helpers to manage template descriptors.
*/ #include <linux/rculist.h> #include "ima.h" #include "ima_template_lib.h" enum header_fields { HDR_PCR, HDR_DIGEST, HDR_TEMPLATE_NAME, HDR_TEMPLATE_DATA, HDR__LAST }; static struct ima_template_desc builtin_templates[] = { {.name = IMA_TEMPLATE_IMA_NAME, .fmt = IMA_TEMPLATE_IMA_FMT}, {.name = "ima-ng", .fmt = "d-ng|n-ng"}, {.name = "ima-sig", .fmt = "d-ng|n-ng|sig"}, {.name = "ima-buf", .fmt = "d-ng|n-ng|buf"}, {.name = "ima-modsig", .fmt = "d-ng|n-ng|sig|d-modsig|modsig"}, {.name = "evm-sig", .fmt = "d-ng|n-ng|evmsig|xattrnames|xattrlengths|xattrvalues|iuid|igid|imode"}, {.name = "", .fmt = ""}, /* placeholder for a custom format */ }; static LIST_HEAD(defined_templates); static DEFINE_SPINLOCK(template_list); static int template_setup_done; static const struct ima_template_field supported_fields[] = { {.field_id = "d", .field_init = ima_eventdigest_init, .field_show = ima_show_template_digest}, {.field_id = "n", .field_init = ima_eventname_init, .field_show = ima_show_template_string}, {.field_id = "d-ng", .field_init = ima_eventdigest_ng_init, .field_show = ima_show_template_digest_ng}, {.field_id = "n-ng", .field_init = ima_eventname_ng_init, .field_show = ima_show_template_string}, {.field_id = "sig", .field_init = ima_eventsig_init, .field_show = ima_show_template_sig}, {.field_id = "buf", .field_init = ima_eventbuf_init, .field_show = ima_show_template_buf}, {.field_id = "d-modsig", .field_init = ima_eventdigest_modsig_init, .field_show = ima_show_template_digest_ng}, {.field_id = "modsig", .field_init = ima_eventmodsig_init, .field_show = ima_show_template_sig}, {.field_id = "evmsig", .field_init = ima_eventevmsig_init, .field_show = ima_show_template_sig}, {.field_id = "iuid", .field_init = ima_eventinodeuid_init, .field_show = ima_show_template_uint}, {.field_id = "igid", .field_init = ima_eventinodegid_init, .field_show = ima_show_template_uint}, {.field_id = "imode", .field_init = ima_eventinodemode_init, .field_show = ima_show_template_uint}, {.field_id = "xattrnames", .field_init = ima_eventinodexattrnames_init, .field_show = ima_show_template_string}, {.field_id = "xattrlengths", .field_init = ima_eventinodexattrlengths_init, .field_show = ima_show_template_sig}, {.field_id = "xattrvalues", .field_init = ima_eventinodexattrvalues_init, .field_show = ima_show_template_sig}, }; /* * Used when restoring measurements carried over from a kexec. 'd' and 'n' don't * need to be accounted for since they shouldn't be defined in the same template * description as 'd-ng' and 'n-ng' respectively. */ #define MAX_TEMPLATE_NAME_LEN \ sizeof("d-ng|n-ng|evmsig|xattrnames|xattrlengths|xattrvalues|iuid|igid|imode") static struct ima_template_desc *ima_template; static struct ima_template_desc *ima_buf_template; /** * ima_template_has_modsig - Check whether template has modsig-related fields. * @ima_template: IMA template to check. * * Tells whether the given template has fields referencing a file's appended * signature. */ bool ima_template_has_modsig(const struct ima_template_desc *ima_template) { int i; for (i = 0; i < ima_template->num_fields; i++) if (!strcmp(ima_template->fields[i]->field_id, "modsig") || !strcmp(ima_template->fields[i]->field_id, "d-modsig")) return true; return false; } static int __init ima_template_setup(char *str) { struct ima_template_desc *template_desc; int template_len = strlen(str); if (template_setup_done) return 1; if (!ima_template) ima_init_template_list(); /* * Verify that a template with the supplied name exists. 
* If not, use CONFIG_IMA_DEFAULT_TEMPLATE. */ template_desc = lookup_template_desc(str); if (!template_desc) { pr_err("template %s not found, using %s\n", str, CONFIG_IMA_DEFAULT_TEMPLATE); return 1; } /* * Verify whether the current hash algorithm is supported * by the 'ima' template. */ if (template_len == 3 && strcmp(str, IMA_TEMPLATE_IMA_NAME) == 0 && ima_hash_algo != HASH_ALGO_SHA1 && ima_hash_algo != HASH_ALGO_MD5) { pr_err("template does not support hash alg\n"); return 1; } ima_template = template_desc; template_setup_done = 1; return 1; } __setup("ima_template=", ima_template_setup); static int __init ima_template_fmt_setup(char *str) { int num_templates = ARRAY_SIZE(builtin_templates); if (template_setup_done) return 1; if (template_desc_init_fields(str, NULL, NULL) < 0) { pr_err("format string '%s' not valid, using template %s\n", str, CONFIG_IMA_DEFAULT_TEMPLATE); return 1; } builtin_templates[num_templates - 1].fmt = str; ima_template = builtin_templates + num_templates - 1; template_setup_done = 1; return 1; } __setup("ima_template_fmt=", ima_template_fmt_setup); struct ima_template_desc *lookup_template_desc(const char *name) { struct ima_template_desc *template_desc; int found = 0; rcu_read_lock(); list_for_each_entry_rcu(template_desc, &defined_templates, list) { if ((strcmp(template_desc->name, name) == 0) || (strcmp(template_desc->fmt, name) == 0)) { found = 1; break; } } rcu_read_unlock(); return found ? template_desc : NULL; } static const struct ima_template_field * lookup_template_field(const char *field_id) { int i; for (i = 0; i < ARRAY_SIZE(supported_fields); i++) if (strncmp(supported_fields[i].field_id, field_id, IMA_TEMPLATE_FIELD_ID_MAX_LEN) == 0) return &supported_fields[i]; return NULL; } static int template_fmt_size(const char *template_fmt) { char c; int template_fmt_len = strlen(template_fmt); int i = 0, j = 0; while (i < template_fmt_len) { c = template_fmt[i]; if (c == '|') j++; i++; } return j + 1; } int template_desc_init_fields(const char *template_fmt, const struct ima_template_field ***fields, int *num_fields) { const char *template_fmt_ptr; const struct ima_template_field *found_fields[IMA_TEMPLATE_NUM_FIELDS_MAX]; int template_num_fields; int i, len; if (num_fields && *num_fields > 0) /* already initialized? 
*/ return 0; template_num_fields = template_fmt_size(template_fmt); if (template_num_fields > IMA_TEMPLATE_NUM_FIELDS_MAX) { pr_err("format string '%s' contains too many fields\n", template_fmt); return -EINVAL; } for (i = 0, template_fmt_ptr = template_fmt; i < template_num_fields; i++, template_fmt_ptr += len + 1) { char tmp_field_id[IMA_TEMPLATE_FIELD_ID_MAX_LEN + 1]; len = strchrnul(template_fmt_ptr, '|') - template_fmt_ptr; if (len == 0 || len > IMA_TEMPLATE_FIELD_ID_MAX_LEN) { pr_err("Invalid field with length %d\n", len); return -EINVAL; } memcpy(tmp_field_id, template_fmt_ptr, len); tmp_field_id[len] = '\0'; found_fields[i] = lookup_template_field(tmp_field_id); if (!found_fields[i]) { pr_err("field '%s' not found\n", tmp_field_id); return -ENOENT; } } if (fields && num_fields) { *fields = kmalloc_array(i, sizeof(**fields), GFP_KERNEL); if (*fields == NULL) return -ENOMEM; memcpy(*fields, found_fields, i * sizeof(**fields)); *num_fields = i; } return 0; } void ima_init_template_list(void) { int i; if (!list_empty(&defined_templates)) return; spin_lock(&template_list); for (i = 0; i < ARRAY_SIZE(builtin_templates); i++) { list_add_tail_rcu(&builtin_templates[i].list, &defined_templates); } spin_unlock(&template_list); } struct ima_template_desc *ima_template_desc_current(void) { if (!ima_template) { ima_init_template_list(); ima_template = lookup_template_desc(CONFIG_IMA_DEFAULT_TEMPLATE); } return ima_template; } struct ima_template_desc *ima_template_desc_buf(void) { if (!ima_buf_template) { ima_init_template_list(); ima_buf_template = lookup_template_desc("ima-buf"); } return ima_buf_template; } int __init ima_init_template(void) { struct ima_template_desc *template = ima_template_desc_current(); int result; result = template_desc_init_fields(template->fmt, &(template->fields), &(template->num_fields)); if (result < 0) { pr_err("template %s init failed, result: %d\n", (strlen(template->name) ? template->name : template->fmt), result); return result; } template = ima_template_desc_buf(); if (!template) { pr_err("Failed to get ima-buf template\n"); return -EINVAL; } result = template_desc_init_fields(template->fmt, &(template->fields), &(template->num_fields)); if (result < 0) pr_err("template %s init failed, result: %d\n", (strlen(template->name) ? 
template->name : template->fmt), result); return result; } static struct ima_template_desc *restore_template_fmt(char *template_name) { struct ima_template_desc *template_desc = NULL; int ret; ret = template_desc_init_fields(template_name, NULL, NULL); if (ret < 0) { pr_err("attempting to initialize the template \"%s\" failed\n", template_name); goto out; } template_desc = kzalloc(sizeof(*template_desc), GFP_KERNEL); if (!template_desc) goto out; template_desc->name = ""; template_desc->fmt = kstrdup(template_name, GFP_KERNEL); if (!template_desc->fmt) { kfree(template_desc); template_desc = NULL; goto out; } spin_lock(&template_list); list_add_tail_rcu(&template_desc->list, &defined_templates); spin_unlock(&template_list); out: return template_desc; } static int ima_restore_template_data(struct ima_template_desc *template_desc, void *template_data, int template_data_size, struct ima_template_entry **entry) { struct tpm_digest *digests; int ret = 0; int i; *entry = kzalloc(struct_size(*entry, template_data, template_desc->num_fields), GFP_NOFS); if (!*entry) return -ENOMEM; digests = kcalloc(NR_BANKS(ima_tpm_chip) + ima_extra_slots, sizeof(*digests), GFP_NOFS); if (!digests) { kfree(*entry); return -ENOMEM; } (*entry)->digests = digests; ret = ima_parse_buf(template_data, template_data + template_data_size, NULL, template_desc->num_fields, (*entry)->template_data, NULL, NULL, ENFORCE_FIELDS | ENFORCE_BUFEND, "template data"); if (ret < 0) { kfree((*entry)->digests); kfree(*entry); return ret; } (*entry)->template_desc = template_desc; for (i = 0; i < template_desc->num_fields; i++) { struct ima_field_data *field_data = &(*entry)->template_data[i]; u8 *data = field_data->data; (*entry)->template_data[i].data = kzalloc(field_data->len + 1, GFP_KERNEL); if (!(*entry)->template_data[i].data) { ret = -ENOMEM; break; } memcpy((*entry)->template_data[i].data, data, field_data->len); (*entry)->template_data_len += sizeof(field_data->len); (*entry)->template_data_len += field_data->len; } if (ret < 0) { ima_free_template_entry(*entry); *entry = NULL; } return ret; } /* Restore the serialized binary measurement list without extending PCRs. 
*/ int ima_restore_measurement_list(loff_t size, void *buf) { char template_name[MAX_TEMPLATE_NAME_LEN]; unsigned char zero[TPM_DIGEST_SIZE] = { 0 }; struct ima_kexec_hdr *khdr = buf; struct ima_field_data hdr[HDR__LAST] = { [HDR_PCR] = {.len = sizeof(u32)}, [HDR_DIGEST] = {.len = TPM_DIGEST_SIZE}, }; void *bufp = buf + sizeof(*khdr); void *bufendp; struct ima_template_entry *entry; struct ima_template_desc *template_desc; DECLARE_BITMAP(hdr_mask, HDR__LAST); unsigned long count = 0; int ret = 0; if (!buf || size < sizeof(*khdr)) return 0; if (ima_canonical_fmt) { khdr->version = le16_to_cpu((__force __le16)khdr->version); khdr->count = le64_to_cpu((__force __le64)khdr->count); khdr->buffer_size = le64_to_cpu((__force __le64)khdr->buffer_size); } if (khdr->version != 1) { pr_err("attempting to restore a incompatible measurement list"); return -EINVAL; } if (khdr->count > ULONG_MAX - 1) { pr_err("attempting to restore too many measurements"); return -EINVAL; } bitmap_zero(hdr_mask, HDR__LAST); bitmap_set(hdr_mask, HDR_PCR, 1); bitmap_set(hdr_mask, HDR_DIGEST, 1); /* * ima kexec buffer prefix: version, buffer size, count * v1 format: pcr, digest, template-name-len, template-name, * template-data-size, template-data */ bufendp = buf + khdr->buffer_size; while ((bufp < bufendp) && (count++ < khdr->count)) { int enforce_mask = ENFORCE_FIELDS; enforce_mask |= (count == khdr->count) ? ENFORCE_BUFEND : 0; ret = ima_parse_buf(bufp, bufendp, &bufp, HDR__LAST, hdr, NULL, hdr_mask, enforce_mask, "entry header"); if (ret < 0) break; if (hdr[HDR_TEMPLATE_NAME].len >= MAX_TEMPLATE_NAME_LEN) { pr_err("attempting to restore a template name that is too long\n"); ret = -EINVAL; break; } /* template name is not null terminated */ memcpy(template_name, hdr[HDR_TEMPLATE_NAME].data, hdr[HDR_TEMPLATE_NAME].len); template_name[hdr[HDR_TEMPLATE_NAME].len] = 0; if (strcmp(template_name, "ima") == 0) { pr_err("attempting to restore an unsupported template \"%s\" failed\n", template_name); ret = -EINVAL; break; } template_desc = lookup_template_desc(template_name); if (!template_desc) { template_desc = restore_template_fmt(template_name); if (!template_desc) break; } /* * Only the running system's template format is initialized * on boot. As needed, initialize the other template formats. */ ret = template_desc_init_fields(template_desc->fmt, &(template_desc->fields), &(template_desc->num_fields)); if (ret < 0) { pr_err("attempting to restore the template fmt \"%s\" failed\n", template_desc->fmt); ret = -EINVAL; break; } ret = ima_restore_template_data(template_desc, hdr[HDR_TEMPLATE_DATA].data, hdr[HDR_TEMPLATE_DATA].len, &entry); if (ret < 0) break; if (memcmp(hdr[HDR_DIGEST].data, zero, sizeof(zero))) { ret = ima_calc_field_array_hash( &entry->template_data[0], entry); if (ret < 0) { pr_err("cannot calculate template digest\n"); ret = -EINVAL; break; } } entry->pcr = !ima_canonical_fmt ? *(u32 *)(hdr[HDR_PCR].data) : le32_to_cpu(*(__le32 *)(hdr[HDR_PCR].data)); ret = ima_restore_measurement_entry(entry); if (ret < 0) break; } return ret; }
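/*
 * A minimal userspace sketch of how an IMA template format string such as
 * "d-ng|n-ng|sig" is split into fields, mirroring template_fmt_size() and the
 * tokenizing loop in template_desc_init_fields() above. The known_fields table
 * and MAX_FIELD_LEN below are illustrative stand-ins, not the kernel's
 * supported_fields[] or IMA_TEMPLATE_FIELD_ID_MAX_LEN.
 */
#include <stdio.h>
#include <string.h>

#define MAX_FIELD_LEN 16

static const char *known_fields[] = { "d", "n", "d-ng", "n-ng", "sig", "buf" };

static int field_is_known(const char *id)
{
        for (size_t i = 0; i < sizeof(known_fields) / sizeof(known_fields[0]); i++)
                if (strcmp(known_fields[i], id) == 0)
                        return 1;
        return 0;
}

/* Number of '|'-separated fields, as in template_fmt_size(). */
static int fmt_size(const char *fmt)
{
        int n = 1;

        for (; *fmt; fmt++)
                if (*fmt == '|')
                        n++;
        return n;
}

int main(void)
{
        const char *fmt = "d-ng|n-ng|sig";
        const char *p = fmt;

        printf("%d field(s) in \"%s\"\n", fmt_size(fmt), fmt);

        for (;;) {
                char id[MAX_FIELD_LEN + 1];
                size_t len = strcspn(p, "|");   /* like strchrnul() - template_fmt_ptr */

                if (len == 0 || len > MAX_FIELD_LEN) {
                        fprintf(stderr, "invalid field of length %zu\n", len);
                        return 1;
                }
                memcpy(id, p, len);
                id[len] = '\0';
                printf("field '%s': %s\n", id, field_is_known(id) ? "ok" : "unknown");
                if (p[len] == '\0')
                        break;
                p += len + 1;
        }
        return 0;
}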
// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. */ #include "peer.h" #include "device.h" #include "queueing.h" #include "timers.h" #include "peerlookup.h" #include "noise.h" #include <linux/kref.h> #include <linux/lockdep.h> #include <linux/rcupdate.h> #include <linux/list.h> static struct kmem_cache *peer_cache; static atomic64_t peer_counter = ATOMIC64_INIT(0); struct wg_peer *wg_peer_create(struct wg_device *wg, const u8 public_key[NOISE_PUBLIC_KEY_LEN], const u8 preshared_key[NOISE_SYMMETRIC_KEY_LEN]) { struct wg_peer *peer; int ret = -ENOMEM; lockdep_assert_held(&wg->device_update_lock); if (wg->num_peers >= MAX_PEERS_PER_DEVICE) return ERR_PTR(ret); peer = kmem_cache_zalloc(peer_cache, GFP_KERNEL); if (unlikely(!peer)) return ERR_PTR(ret); if (unlikely(dst_cache_init(&peer->endpoint_cache, GFP_KERNEL))) goto err; peer->device = wg; wg_noise_handshake_init(&peer->handshake, &wg->static_identity, public_key, preshared_key, peer); peer->internal_id = atomic64_inc_return(&peer_counter); peer->serial_work_cpu = nr_cpumask_bits; wg_cookie_init(&peer->latest_cookie); wg_timers_init(peer); wg_cookie_checker_precompute_peer_keys(peer); spin_lock_init(&peer->keypairs.keypair_update_lock); INIT_WORK(&peer->transmit_handshake_work, wg_packet_handshake_send_worker); INIT_WORK(&peer->transmit_packet_work, wg_packet_tx_worker); wg_prev_queue_init(&peer->tx_queue); wg_prev_queue_init(&peer->rx_queue); rwlock_init(&peer->endpoint_lock); kref_init(&peer->refcount); skb_queue_head_init(&peer->staged_packet_queue); wg_noise_reset_last_sent_handshake(&peer->last_sent_handshake); set_bit(NAPI_STATE_NO_BUSY_POLL, &peer->napi.state); netif_napi_add(wg->dev, &peer->napi, wg_packet_rx_poll, NAPI_POLL_WEIGHT); napi_enable(&peer->napi); list_add_tail(&peer->peer_list, &wg->peer_list); INIT_LIST_HEAD(&peer->allowedips_list); wg_pubkey_hashtable_add(wg->peer_hashtable, peer); ++wg->num_peers; pr_debug("%s: Peer %llu created\n", wg->dev->name, peer->internal_id); return peer; err: kmem_cache_free(peer_cache, peer); return ERR_PTR(ret); } struct wg_peer *wg_peer_get_maybe_zero(struct wg_peer *peer) { RCU_LOCKDEP_WARN(!rcu_read_lock_bh_held(), "Taking peer reference without holding the RCU read lock"); if (unlikely(!peer || !kref_get_unless_zero(&peer->refcount))) return NULL; return peer; } static void peer_make_dead(struct wg_peer *peer) { /* Remove from configuration-time lookup structures.
*/ list_del_init(&peer->peer_list); wg_allowedips_remove_by_peer(&peer->device->peer_allowedips, peer, &peer->device->device_update_lock); wg_pubkey_hashtable_remove(peer->device->peer_hashtable, peer); /* Mark as dead, so that we don't allow jumping contexts after. */ WRITE_ONCE(peer->is_dead, true); /* The caller must now synchronize_net() for this to take effect. */ } static void peer_remove_after_dead(struct wg_peer *peer) { WARN_ON(!peer->is_dead); /* No more keypairs can be created for this peer, since is_dead protects * add_new_keypair, so we can now destroy existing ones. */ wg_noise_keypairs_clear(&peer->keypairs); /* Destroy all ongoing timers that were in-flight at the beginning of * this function. */ wg_timers_stop(peer); /* The transition between packet encryption/decryption queues isn't * guarded by is_dead, but each reference's life is strictly bounded by * two generations: once for parallel crypto and once for serial * ingestion, so we can simply flush twice, and be sure that we no * longer have references inside these queues. */ /* a) For encrypt/decrypt. */ flush_workqueue(peer->device->packet_crypt_wq); /* b.1) For send (but not receive, since that's napi). */ flush_workqueue(peer->device->packet_crypt_wq); /* b.2.1) For receive (but not send, since that's wq). */ napi_disable(&peer->napi); /* b.2.1) It's now safe to remove the napi struct, which must be done * here from process context. */ netif_napi_del(&peer->napi); /* Ensure any workstructs we own (like transmit_handshake_work or * clear_peer_work) no longer are in use. */ flush_workqueue(peer->device->handshake_send_wq); /* After the above flushes, a peer might still be active in a few * different contexts: 1) from xmit(), before hitting is_dead and * returning, 2) from wg_packet_consume_data(), before hitting is_dead * and returning, 3) from wg_receive_handshake_packet() after a point * where it has processed an incoming handshake packet, but where * all calls to pass it off to timers fails because of is_dead. We won't * have new references in (1) eventually, because we're removed from * allowedips; we won't have new references in (2) eventually, because * wg_index_hashtable_lookup will always return NULL, since we removed * all existing keypairs and no more can be created; we won't have new * references in (3) eventually, because we're removed from the pubkey * hash table, which allows for a maximum of one handshake response, * via the still-uncleared index hashtable entry, but not more than one, * and in wg_cookie_message_consume, the lookup eventually gets a peer * with a refcount of zero, so no new reference is taken. */ --peer->device->num_peers; wg_peer_put(peer); } /* We have a separate "remove" function make sure that all active places where * a peer is currently operating will eventually come to an end and not pass * their reference onto another context. */ void wg_peer_remove(struct wg_peer *peer) { if (unlikely(!peer)) return; lockdep_assert_held(&peer->device->device_update_lock); peer_make_dead(peer); synchronize_net(); peer_remove_after_dead(peer); } void wg_peer_remove_all(struct wg_device *wg) { struct wg_peer *peer, *temp; LIST_HEAD(dead_peers); lockdep_assert_held(&wg->device_update_lock); /* Avoid having to traverse individually for each one. 
*/ wg_allowedips_free(&wg->peer_allowedips, &wg->device_update_lock); list_for_each_entry_safe(peer, temp, &wg->peer_list, peer_list) { peer_make_dead(peer); list_add_tail(&peer->peer_list, &dead_peers); } synchronize_net(); list_for_each_entry_safe(peer, temp, &dead_peers, peer_list) peer_remove_after_dead(peer); } static void rcu_release(struct rcu_head *rcu) { struct wg_peer *peer = container_of(rcu, struct wg_peer, rcu); dst_cache_destroy(&peer->endpoint_cache); WARN_ON(wg_prev_queue_peek(&peer->tx_queue) || wg_prev_queue_peek(&peer->rx_queue)); /* The final zeroing takes care of clearing any remaining handshake key * material and other potentially sensitive information. */ memzero_explicit(peer, sizeof(*peer)); kmem_cache_free(peer_cache, peer); } static void kref_release(struct kref *refcount) { struct wg_peer *peer = container_of(refcount, struct wg_peer, refcount); pr_debug("%s: Peer %llu (%pISpfsc) destroyed\n", peer->device->dev->name, peer->internal_id, &peer->endpoint.addr); /* Remove ourself from dynamic runtime lookup structures, now that the * last reference is gone. */ wg_index_hashtable_remove(peer->device->index_hashtable, &peer->handshake.entry); /* Remove any lingering packets that didn't have a chance to be * transmitted. */ wg_packet_purge_staged_packets(peer); /* Free the memory used. */ call_rcu(&peer->rcu, rcu_release); } void wg_peer_put(struct wg_peer *peer) { if (unlikely(!peer)) return; kref_put(&peer->refcount, kref_release); } int __init wg_peer_init(void) { peer_cache = KMEM_CACHE(wg_peer, 0); return peer_cache ? 0 : -ENOMEM; } void wg_peer_uninit(void) { kmem_cache_destroy(peer_cache); }
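The peer teardown above relies on wg_peer_get_maybe_zero() only succeeding while the refcount is still non-zero, so a peer that has begun dying can never be revived by an RCU reader. Below is a minimal userspace sketch of that kref_get_unless_zero()-style pattern using C11 atomics; the struct and function names are illustrative, not kernel code.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct obj {
	atomic_uint refcount;
};

/* Analogue of kref_get_unless_zero(): a CAS loop that refuses to revive 0. */
static bool get_unless_zero(struct obj *o)
{
	unsigned int v = atomic_load(&o->refcount);

	while (v != 0) {
		if (atomic_compare_exchange_weak(&o->refcount, &v, v + 1))
			return true;
	}
	return false;
}

/* Analogue of kref_put(): the thread that drops the last reference releases. */
static void put(struct obj *o)
{
	if (atomic_fetch_sub(&o->refcount, 1) == 1)
		printf("last reference dropped: release object\n");
}

int main(void)
{
	struct obj o = { .refcount = 1 };

	if (get_unless_zero(&o))	/* succeeds: count 1 -> 2 */
		put(&o);		/* back to 1 */
	put(&o);			/* 1 -> 0: release */
	printf("get after zero succeeds: %s\n",
	       get_unless_zero(&o) ? "yes" : "no");	/* prints "no" */
	return 0;
}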
35 18 18 18 18 18 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 // SPDX-License-Identifier: GPL-2.0-only /* * VLAN netlink control interface * * Copyright (c) 2007 Patrick McHardy <kaber@trash.net> */ #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/if_vlan.h> #include <linux/module.h> #include <net/net_namespace.h> #include <net/netlink.h> #include <net/rtnetlink.h> #include "vlan.h" static const struct nla_policy vlan_policy[IFLA_VLAN_MAX + 1] = { [IFLA_VLAN_ID] = { .type = NLA_U16 }, [IFLA_VLAN_FLAGS] = { .len = sizeof(struct ifla_vlan_flags) }, [IFLA_VLAN_EGRESS_QOS] = { .type = NLA_NESTED }, [IFLA_VLAN_INGRESS_QOS] = { .type = NLA_NESTED }, [IFLA_VLAN_PROTOCOL] = { .type = NLA_U16 }, }; static const struct nla_policy vlan_map_policy[IFLA_VLAN_QOS_MAX + 1] = { [IFLA_VLAN_QOS_MAPPING] = { .len = sizeof(struct ifla_vlan_qos_mapping) }, }; static inline int vlan_validate_qos_map(struct nlattr *attr) { if (!attr) return 0; return nla_validate_nested_deprecated(attr, IFLA_VLAN_QOS_MAX, vlan_map_policy, NULL); } static int vlan_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct ifla_vlan_flags *flags; u16 id; int err; if (tb[IFLA_ADDRESS]) { if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) { NL_SET_ERR_MSG_MOD(extack, "Invalid link address"); return -EINVAL; } if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) { NL_SET_ERR_MSG_MOD(extack, "Invalid link address"); return -EADDRNOTAVAIL; } } if (!data) { NL_SET_ERR_MSG_MOD(extack, "VLAN properties not specified"); return -EINVAL; } if (data[IFLA_VLAN_PROTOCOL]) { switch (nla_get_be16(data[IFLA_VLAN_PROTOCOL])) { case htons(ETH_P_8021Q): case htons(ETH_P_8021AD): break; default: NL_SET_ERR_MSG_MOD(extack, "Invalid VLAN protocol"); return -EPROTONOSUPPORT; } } if (data[IFLA_VLAN_ID]) { id = nla_get_u16(data[IFLA_VLAN_ID]); if (id >= VLAN_VID_MASK) { NL_SET_ERR_MSG_MOD(extack, "Invalid VLAN id"); return -ERANGE; } } if (data[IFLA_VLAN_FLAGS]) { flags = nla_data(data[IFLA_VLAN_FLAGS]); if ((flags->flags & flags->mask) & ~(VLAN_FLAG_REORDER_HDR | VLAN_FLAG_GVRP | VLAN_FLAG_LOOSE_BINDING | VLAN_FLAG_MVRP | VLAN_FLAG_BRIDGE_BINDING)) { NL_SET_ERR_MSG_MOD(extack, "Invalid VLAN flags"); return -EINVAL; } } err = vlan_validate_qos_map(data[IFLA_VLAN_INGRESS_QOS]); if (err < 0) { NL_SET_ERR_MSG_MOD(extack, "Invalid ingress QOS map"); return err; } err = vlan_validate_qos_map(data[IFLA_VLAN_EGRESS_QOS]); if (err < 0) { 
NL_SET_ERR_MSG_MOD(extack, "Invalid egress QOS map"); return err; } return 0; } static int vlan_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct ifla_vlan_flags *flags; struct ifla_vlan_qos_mapping *m; struct nlattr *attr; int rem, err; if (data[IFLA_VLAN_FLAGS]) { flags = nla_data(data[IFLA_VLAN_FLAGS]); err = vlan_dev_change_flags(dev, flags->flags, flags->mask); if (err) return err; } if (data[IFLA_VLAN_INGRESS_QOS]) { nla_for_each_nested(attr, data[IFLA_VLAN_INGRESS_QOS], rem) { if (nla_type(attr) != IFLA_VLAN_QOS_MAPPING) continue; m = nla_data(attr); vlan_dev_set_ingress_priority(dev, m->to, m->from); } } if (data[IFLA_VLAN_EGRESS_QOS]) { nla_for_each_nested(attr, data[IFLA_VLAN_EGRESS_QOS], rem) { if (nla_type(attr) != IFLA_VLAN_QOS_MAPPING) continue; m = nla_data(attr); err = vlan_dev_set_egress_priority(dev, m->from, m->to); if (err) return err; } } return 0; } static int vlan_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct vlan_dev_priv *vlan = vlan_dev_priv(dev); struct net_device *real_dev; unsigned int max_mtu; __be16 proto; int err; if (!data[IFLA_VLAN_ID]) { NL_SET_ERR_MSG_MOD(extack, "VLAN id not specified"); return -EINVAL; } if (!tb[IFLA_LINK]) { NL_SET_ERR_MSG_MOD(extack, "link not specified"); return -EINVAL; } real_dev = __dev_get_by_index(src_net, nla_get_u32(tb[IFLA_LINK])); if (!real_dev) { NL_SET_ERR_MSG_MOD(extack, "link does not exist"); return -ENODEV; } if (data[IFLA_VLAN_PROTOCOL]) proto = nla_get_be16(data[IFLA_VLAN_PROTOCOL]); else proto = htons(ETH_P_8021Q); vlan->vlan_proto = proto; vlan->vlan_id = nla_get_u16(data[IFLA_VLAN_ID]); vlan->real_dev = real_dev; dev->priv_flags |= (real_dev->priv_flags & IFF_XMIT_DST_RELEASE); vlan->flags = VLAN_FLAG_REORDER_HDR; err = vlan_check_real_dev(real_dev, vlan->vlan_proto, vlan->vlan_id, extack); if (err < 0) return err; max_mtu = netif_reduces_vlan_mtu(real_dev) ? 
real_dev->mtu - VLAN_HLEN : real_dev->mtu; if (!tb[IFLA_MTU]) dev->mtu = max_mtu; else if (dev->mtu > max_mtu) return -EINVAL; err = vlan_changelink(dev, tb, data, extack); if (err) return err; err = register_vlan_dev(dev, extack); if (err) vlan_dev_free_egress_priority(dev); return err; } static inline size_t vlan_qos_map_size(unsigned int n) { if (n == 0) return 0; /* IFLA_VLAN_{EGRESS,INGRESS}_QOS + n * IFLA_VLAN_QOS_MAPPING */ return nla_total_size(sizeof(struct nlattr)) + nla_total_size(sizeof(struct ifla_vlan_qos_mapping)) * n; } static size_t vlan_get_size(const struct net_device *dev) { struct vlan_dev_priv *vlan = vlan_dev_priv(dev); return nla_total_size(2) + /* IFLA_VLAN_PROTOCOL */ nla_total_size(2) + /* IFLA_VLAN_ID */ nla_total_size(sizeof(struct ifla_vlan_flags)) + /* IFLA_VLAN_FLAGS */ vlan_qos_map_size(vlan->nr_ingress_mappings) + vlan_qos_map_size(vlan->nr_egress_mappings); } static int vlan_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct vlan_dev_priv *vlan = vlan_dev_priv(dev); struct vlan_priority_tci_mapping *pm; struct ifla_vlan_flags f; struct ifla_vlan_qos_mapping m; struct nlattr *nest; unsigned int i; if (nla_put_be16(skb, IFLA_VLAN_PROTOCOL, vlan->vlan_proto) || nla_put_u16(skb, IFLA_VLAN_ID, vlan->vlan_id)) goto nla_put_failure; if (vlan->flags) { f.flags = vlan->flags; f.mask = ~0; if (nla_put(skb, IFLA_VLAN_FLAGS, sizeof(f), &f)) goto nla_put_failure; } if (vlan->nr_ingress_mappings) { nest = nla_nest_start_noflag(skb, IFLA_VLAN_INGRESS_QOS); if (nest == NULL) goto nla_put_failure; for (i = 0; i < ARRAY_SIZE(vlan->ingress_priority_map); i++) { if (!vlan->ingress_priority_map[i]) continue; m.from = i; m.to = vlan->ingress_priority_map[i]; if (nla_put(skb, IFLA_VLAN_QOS_MAPPING, sizeof(m), &m)) goto nla_put_failure; } nla_nest_end(skb, nest); } if (vlan->nr_egress_mappings) { nest = nla_nest_start_noflag(skb, IFLA_VLAN_EGRESS_QOS); if (nest == NULL) goto nla_put_failure; for (i = 0; i < ARRAY_SIZE(vlan->egress_priority_map); i++) { for (pm = vlan->egress_priority_map[i]; pm; pm = pm->next) { if (!pm->vlan_qos) continue; m.from = pm->priority; m.to = (pm->vlan_qos >> 13) & 0x7; if (nla_put(skb, IFLA_VLAN_QOS_MAPPING, sizeof(m), &m)) goto nla_put_failure; } } nla_nest_end(skb, nest); } return 0; nla_put_failure: return -EMSGSIZE; } static struct net *vlan_get_link_net(const struct net_device *dev) { struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; return dev_net(real_dev); } struct rtnl_link_ops vlan_link_ops __read_mostly = { .kind = "vlan", .maxtype = IFLA_VLAN_MAX, .policy = vlan_policy, .priv_size = sizeof(struct vlan_dev_priv), .setup = vlan_setup, .validate = vlan_validate, .newlink = vlan_newlink, .changelink = vlan_changelink, .dellink = unregister_vlan_dev, .get_size = vlan_get_size, .fill_info = vlan_fill_info, .get_link_net = vlan_get_link_net, }; int __init vlan_netlink_init(void) { return rtnl_link_register(&vlan_link_ops); } void __exit vlan_netlink_fini(void) { rtnl_link_unregister(&vlan_link_ops); } MODULE_ALIAS_RTNL_LINK("vlan");
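The egress dump in vlan_fill_info() recovers the PCP with (pm->vlan_qos >> 13) & 0x7, which implies vlan_qos is stored pre-shifted into the priority bits of the VLAN TCI. A tiny standalone sketch of that encoding follows; the values are illustrative only.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t skb_prio = 5;				/* e.g. m->from */
	uint16_t vlan_qos = (skb_prio & 0x7) << 13;	/* stored, pre-shifted form */
	uint32_t pcp = (vlan_qos >> 13) & 0x7;		/* m->to as dumped above */

	printf("prio=%u vlan_qos=0x%04x pcp=%u\n", skb_prio, vlan_qos, pcp);
	return 0;
}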
30 29 1 30 2 15 13 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 // SPDX-License-Identifier: GPL-2.0-only /* * Network Service Header * * Copyright (c) 2017 Red Hat, Inc. -- Jiri Benc <jbenc@redhat.com> */ #include <linux/module.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <net/nsh.h> #include <net/tun_proto.h> int nsh_push(struct sk_buff *skb, const struct nshhdr *pushed_nh) { struct nshhdr *nh; size_t length = nsh_hdr_len(pushed_nh); u8 next_proto; if (skb->mac_len) { next_proto = TUN_P_ETHERNET; } else { next_proto = tun_p_from_eth_p(skb->protocol); if (!next_proto) return -EAFNOSUPPORT; } /* Add the NSH header */ if (skb_cow_head(skb, length) < 0) return -ENOMEM; skb_push(skb, length); nh = (struct nshhdr *)(skb->data); memcpy(nh, pushed_nh, length); nh->np = next_proto; skb_postpush_rcsum(skb, nh, length); skb->protocol = htons(ETH_P_NSH); skb_reset_mac_header(skb); skb_reset_network_header(skb); skb_reset_mac_len(skb); return 0; } EXPORT_SYMBOL_GPL(nsh_push); int nsh_pop(struct sk_buff *skb) { struct nshhdr *nh; size_t length; __be16 inner_proto; if (!pskb_may_pull(skb, NSH_BASE_HDR_LEN)) return -ENOMEM; nh = (struct nshhdr *)(skb->data); length = nsh_hdr_len(nh); if (length < NSH_BASE_HDR_LEN) return -EINVAL; inner_proto = tun_p_to_eth_p(nh->np); if (!pskb_may_pull(skb, length)) return -ENOMEM; if (!inner_proto) return -EAFNOSUPPORT; skb_pull_rcsum(skb, length); skb_reset_mac_header(skb); skb_reset_network_header(skb); skb_reset_mac_len(skb); skb->protocol = inner_proto; return 0; } EXPORT_SYMBOL_GPL(nsh_pop); static struct sk_buff *nsh_gso_segment(struct sk_buff *skb, netdev_features_t features) { unsigned int outer_hlen, mac_len, nsh_len; struct sk_buff *segs = ERR_PTR(-EINVAL); u16 mac_offset = skb->mac_header; __be16 outer_proto, proto; skb_reset_network_header(skb); outer_proto = skb->protocol; outer_hlen = skb_mac_header_len(skb); mac_len = skb->mac_len; if (unlikely(!pskb_may_pull(skb, NSH_BASE_HDR_LEN))) goto out; nsh_len = nsh_hdr_len(nsh_hdr(skb)); if (nsh_len < NSH_BASE_HDR_LEN) goto out; if (unlikely(!pskb_may_pull(skb, nsh_len))) goto out; proto = tun_p_to_eth_p(nsh_hdr(skb)->np); if (!proto) goto out; __skb_pull(skb, nsh_len); skb_reset_mac_header(skb); skb->mac_len = proto == htons(ETH_P_TEB) ? 
ETH_HLEN : 0; skb->protocol = proto; features &= NETIF_F_SG; segs = skb_mac_gso_segment(skb, features); if (IS_ERR_OR_NULL(segs)) { skb_gso_error_unwind(skb, htons(ETH_P_NSH), nsh_len, mac_offset, mac_len); goto out; } for (skb = segs; skb; skb = skb->next) { skb->protocol = outer_proto; __skb_push(skb, nsh_len + outer_hlen); skb_reset_mac_header(skb); skb_set_network_header(skb, outer_hlen); skb->mac_len = mac_len; } out: return segs; } static struct packet_offload nsh_packet_offload __read_mostly = { .type = htons(ETH_P_NSH), .priority = 15, .callbacks = { .gso_segment = nsh_gso_segment, }, }; static int __init nsh_init_module(void) { dev_add_offload(&nsh_packet_offload); return 0; } static void __exit nsh_cleanup_module(void) { dev_remove_offload(&nsh_packet_offload); } module_init(nsh_init_module); module_exit(nsh_cleanup_module); MODULE_AUTHOR("Jiri Benc <jbenc@redhat.com>"); MODULE_DESCRIPTION("NSH protocol"); MODULE_LICENSE("GPL v2");
172 1016 994 22 39 183 146 37 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 #ifndef _LINUX_JHASH_H #define _LINUX_JHASH_H /* jhash.h: Jenkins hash support. * * Copyright (C) 2006. Bob Jenkins (bob_jenkins@burtleburtle.net) * * https://burtleburtle.net/bob/hash/ * * These are the credits from Bob's sources: * * lookup3.c, by Bob Jenkins, May 2006, Public Domain. * * These are functions for producing 32-bit hashes for hash table lookup. * hashword(), hashlittle(), hashlittle2(), hashbig(), mix(), and final() * are externally useful functions. Routines to test the hash are included * if SELF_TEST is defined. You can use this free for any purpose. It's in * the public domain. It has no warranty. * * Copyright (C) 2009-2010 Jozsef Kadlecsik (kadlec@netfilter.org) * * I've modified Bob's hash to be useful in the Linux kernel, and * any bugs present are my fault. * Jozsef */ #include <linux/bitops.h> #include <linux/unaligned/packed_struct.h> /* Best hash sizes are of power of two */ #define jhash_size(n) ((u32)1<<(n)) /* Mask the hash value, i.e (value & jhash_mask(n)) instead of (value % n) */ #define jhash_mask(n) (jhash_size(n)-1) /* __jhash_mix -- mix 3 32-bit values reversibly. */ #define __jhash_mix(a, b, c) \ { \ a -= c; a ^= rol32(c, 4); c += b; \ b -= a; b ^= rol32(a, 6); a += c; \ c -= b; c ^= rol32(b, 8); b += a; \ a -= c; a ^= rol32(c, 16); c += b; \ b -= a; b ^= rol32(a, 19); a += c; \ c -= b; c ^= rol32(b, 4); b += a; \ } /* __jhash_final - final mixing of 3 32-bit values (a,b,c) into c */ #define __jhash_final(a, b, c) \ { \ c ^= b; c -= rol32(b, 14); \ a ^= c; a -= rol32(c, 11); \ b ^= a; b -= rol32(a, 25); \ c ^= b; c -= rol32(b, 16); \ a ^= c; a -= rol32(c, 4); \ b ^= a; b -= rol32(a, 14); \ c ^= b; c -= rol32(b, 24); \ } /* An arbitrary initial parameter */ #define JHASH_INITVAL 0xdeadbeef /* jhash - hash an arbitrary key * @k: sequence of bytes as key * @length: the length of the key * @initval: the previous hash, or an arbitray value * * The generic version, hashes an arbitrary sequence of bytes. * No alignment or length assumptions are made about the input key. * * Returns the hash value of the key. The result depends on endianness. 
*/ static inline u32 jhash(const void *key, u32 length, u32 initval) { u32 a, b, c; const u8 *k = key; /* Set up the internal state */ a = b = c = JHASH_INITVAL + length + initval; /* All but the last block: affect some 32 bits of (a,b,c) */ while (length > 12) { a += __get_unaligned_cpu32(k); b += __get_unaligned_cpu32(k + 4); c += __get_unaligned_cpu32(k + 8); __jhash_mix(a, b, c); length -= 12; k += 12; } /* Last block: affect all 32 bits of (c) */ switch (length) { case 12: c += (u32)k[11]<<24; fallthrough; case 11: c += (u32)k[10]<<16; fallthrough; case 10: c += (u32)k[9]<<8; fallthrough; case 9: c += k[8]; fallthrough; case 8: b += (u32)k[7]<<24; fallthrough; case 7: b += (u32)k[6]<<16; fallthrough; case 6: b += (u32)k[5]<<8; fallthrough; case 5: b += k[4]; fallthrough; case 4: a += (u32)k[3]<<24; fallthrough; case 3: a += (u32)k[2]<<16; fallthrough; case 2: a += (u32)k[1]<<8; fallthrough; case 1: a += k[0]; __jhash_final(a, b, c); break; case 0: /* Nothing left to add */ break; } return c; } /* jhash2 - hash an array of u32's * @k: the key which must be an array of u32's * @length: the number of u32's in the key * @initval: the previous hash, or an arbitray value * * Returns the hash value of the key. */ static inline u32 jhash2(const u32 *k, u32 length, u32 initval) { u32 a, b, c; /* Set up the internal state */ a = b = c = JHASH_INITVAL + (length<<2) + initval; /* Handle most of the key */ while (length > 3) { a += k[0]; b += k[1]; c += k[2]; __jhash_mix(a, b, c); length -= 3; k += 3; } /* Handle the last 3 u32's */ switch (length) { case 3: c += k[2]; fallthrough; case 2: b += k[1]; fallthrough; case 1: a += k[0]; __jhash_final(a, b, c); break; case 0: /* Nothing left to add */ break; } return c; } /* __jhash_nwords - hash exactly 3, 2 or 1 word(s) */ static inline u32 __jhash_nwords(u32 a, u32 b, u32 c, u32 initval) { a += initval; b += initval; c += initval; __jhash_final(a, b, c); return c; } static inline u32 jhash_3words(u32 a, u32 b, u32 c, u32 initval) { return __jhash_nwords(a, b, c, initval + JHASH_INITVAL + (3 << 2)); } static inline u32 jhash_2words(u32 a, u32 b, u32 initval) { return __jhash_nwords(a, b, 0, initval + JHASH_INITVAL + (2 << 2)); } static inline u32 jhash_1word(u32 a, u32 initval) { return __jhash_nwords(a, 0, 0, initval + JHASH_INITVAL + (1 << 2)); } #endif /* _LINUX_JHASH_H */
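As a quick usage illustration, the word-sized helpers above reduce to a single __jhash_final() pass over pre-seeded words, and the result is normally bucketed with a jhash_mask()-style AND. The standalone sketch below restates jhash_3words() in userspace (rol32 and JHASH_INITVAL redefined locally; key and table size are arbitrary).

#include <stdint.h>
#include <stdio.h>

#define JHASH_INITVAL 0xdeadbeef

static inline uint32_t rol32(uint32_t w, unsigned int s)
{
	return (w << (s & 31)) | (w >> ((-s) & 31));
}

/* Same final mix as __jhash_final() in the header above. */
static uint32_t jhash_final(uint32_t a, uint32_t b, uint32_t c)
{
	c ^= b; c -= rol32(b, 14);
	a ^= c; a -= rol32(c, 11);
	b ^= a; b -= rol32(a, 25);
	c ^= b; c -= rol32(b, 16);
	a ^= c; a -= rol32(c, 4);
	b ^= a; b -= rol32(a, 14);
	c ^= b; c -= rol32(b, 24);
	return c;
}

/* Mirrors jhash_3words(): hash a 3-word key (e.g. addresses + ports). */
static uint32_t jhash_3words(uint32_t a, uint32_t b, uint32_t c, uint32_t initval)
{
	initval += JHASH_INITVAL + (3 << 2);
	return jhash_final(a + initval, b + initval, c + initval);
}

int main(void)
{
	uint32_t h = jhash_3words(0xc0a80101, 0xc0a80102, (80u << 16) | 443, 0);

	/* Bucket into a 256-entry table: jhash_mask(8) == 255. */
	printf("hash=0x%08x bucket=%u\n", h, h & 255);
	return 0;
}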
47 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 // SPDX-License-Identifier: GPL-2.0-only /* * (C) 2007 Patrick McHardy <kaber@trash.net> */ #include <linux/module.h> #include <linux/skbuff.h> #include <linux/gen_stats.h> #include <linux/jhash.h> #include <linux/rtnetlink.h> #include <linux/random.h> #include <linux/slab.h> #include <net/gen_stats.h> #include <net/netlink.h> #include <net/netns/generic.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_RATEEST.h> #include <net/netfilter/xt_rateest.h> #define RATEEST_HSIZE 16 struct xt_rateest_net { struct mutex hash_lock; struct hlist_head hash[RATEEST_HSIZE]; }; static unsigned int xt_rateest_id; static unsigned int jhash_rnd __read_mostly; static unsigned int xt_rateest_hash(const char *name) { return jhash(name, sizeof_field(struct xt_rateest, name), jhash_rnd) & (RATEEST_HSIZE - 1); } static void xt_rateest_hash_insert(struct xt_rateest_net *xn, struct xt_rateest *est) { unsigned int h; h = xt_rateest_hash(est->name); hlist_add_head(&est->list, &xn->hash[h]); } static struct xt_rateest *__xt_rateest_lookup(struct xt_rateest_net *xn, const char *name) { struct xt_rateest *est; unsigned int h; h = xt_rateest_hash(name); hlist_for_each_entry(est, &xn->hash[h], list) { if (strcmp(est->name, name) == 0) { est->refcnt++; return est; } } return NULL; } struct xt_rateest *xt_rateest_lookup(struct net *net, const char *name) { struct xt_rateest_net *xn = net_generic(net, xt_rateest_id); struct xt_rateest *est; mutex_lock(&xn->hash_lock); est = __xt_rateest_lookup(xn, name); mutex_unlock(&xn->hash_lock); return est; } EXPORT_SYMBOL_GPL(xt_rateest_lookup); void xt_rateest_put(struct net *net, struct xt_rateest *est) { struct xt_rateest_net *xn = net_generic(net, xt_rateest_id); mutex_lock(&xn->hash_lock); if (--est->refcnt == 0) { hlist_del(&est->list); gen_kill_estimator(&est->rate_est); /* * gen_estimator est_timer() might access est->lock or bstats, * wait a RCU grace period before freeing 'est' */ kfree_rcu(est, rcu); } mutex_unlock(&xn->hash_lock); } EXPORT_SYMBOL_GPL(xt_rateest_put); static unsigned int xt_rateest_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_rateest_target_info *info = par->targinfo; struct gnet_stats_basic_packed *stats = &info->est->bstats; spin_lock_bh(&info->est->lock); stats->bytes += skb->len; stats->packets++; spin_unlock_bh(&info->est->lock); return XT_CONTINUE; } static int xt_rateest_tg_checkentry(const struct xt_tgchk_param *par) { struct xt_rateest_net *xn = net_generic(par->net, xt_rateest_id); struct xt_rateest_target_info *info = par->targinfo; struct xt_rateest *est; struct { struct nlattr opt; struct gnet_estimator est; } cfg; int ret; if 
(strnlen(info->name, sizeof(est->name)) >= sizeof(est->name)) return -ENAMETOOLONG; net_get_random_once(&jhash_rnd, sizeof(jhash_rnd)); mutex_lock(&xn->hash_lock); est = __xt_rateest_lookup(xn, info->name); if (est) { mutex_unlock(&xn->hash_lock); /* * If estimator parameters are specified, they must match the * existing estimator. */ if ((!info->interval && !info->ewma_log) || (info->interval != est->params.interval || info->ewma_log != est->params.ewma_log)) { xt_rateest_put(par->net, est); return -EINVAL; } info->est = est; return 0; } ret = -ENOMEM; est = kzalloc(sizeof(*est), GFP_KERNEL); if (!est) goto err1; strlcpy(est->name, info->name, sizeof(est->name)); spin_lock_init(&est->lock); est->refcnt = 1; est->params.interval = info->interval; est->params.ewma_log = info->ewma_log; cfg.opt.nla_len = nla_attr_size(sizeof(cfg.est)); cfg.opt.nla_type = TCA_STATS_RATE_EST; cfg.est.interval = info->interval; cfg.est.ewma_log = info->ewma_log; ret = gen_new_estimator(&est->bstats, NULL, &est->rate_est, &est->lock, NULL, &cfg.opt); if (ret < 0) goto err2; info->est = est; xt_rateest_hash_insert(xn, est); mutex_unlock(&xn->hash_lock); return 0; err2: kfree(est); err1: mutex_unlock(&xn->hash_lock); return ret; } static void xt_rateest_tg_destroy(const struct xt_tgdtor_param *par) { struct xt_rateest_target_info *info = par->targinfo; xt_rateest_put(par->net, info->est); } static struct xt_target xt_rateest_tg_reg[] __read_mostly = { { .name = "RATEEST", .revision = 0, .family = NFPROTO_IPV4, .target = xt_rateest_tg, .checkentry = xt_rateest_tg_checkentry, .destroy = xt_rateest_tg_destroy, .targetsize = sizeof(struct xt_rateest_target_info), .usersize = offsetof(struct xt_rateest_target_info, est), .me = THIS_MODULE, }, #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) { .name = "RATEEST", .revision = 0, .family = NFPROTO_IPV6, .target = xt_rateest_tg, .checkentry = xt_rateest_tg_checkentry, .destroy = xt_rateest_tg_destroy, .targetsize = sizeof(struct xt_rateest_target_info), .usersize = offsetof(struct xt_rateest_target_info, est), .me = THIS_MODULE, }, #endif }; static __net_init int xt_rateest_net_init(struct net *net) { struct xt_rateest_net *xn = net_generic(net, xt_rateest_id); int i; mutex_init(&xn->hash_lock); for (i = 0; i < ARRAY_SIZE(xn->hash); i++) INIT_HLIST_HEAD(&xn->hash[i]); return 0; } static struct pernet_operations xt_rateest_net_ops = { .init = xt_rateest_net_init, .id = &xt_rateest_id, .size = sizeof(struct xt_rateest_net), }; static int __init xt_rateest_tg_init(void) { int err = register_pernet_subsys(&xt_rateest_net_ops); if (err) return err; return xt_register_targets(xt_rateest_tg_reg, ARRAY_SIZE(xt_rateest_tg_reg)); } static void __exit xt_rateest_tg_fini(void) { xt_unregister_targets(xt_rateest_tg_reg, ARRAY_SIZE(xt_rateest_tg_reg)); unregister_pernet_subsys(&xt_rateest_net_ops); } MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Xtables: packet rate estimator"); MODULE_ALIAS("ipt_RATEEST"); MODULE_ALIAS("ip6t_RATEEST"); module_init(xt_rateest_tg_init); module_exit(xt_rateest_tg_fini);
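The checkentry routine hands gen_new_estimator() a fake netlink attribute built on the stack: cfg.opt.nla_len is set with nla_attr_size(), i.e. the 4-byte attribute header plus the payload length. The sketch below restates that sizing arithmetic in userspace; the NLA_* macros are copied by hand and the estimator struct is a stand-in whose exact layout is assumed, not taken from the kernel headers.

#include <stdint.h>
#include <stdio.h>

struct nlattr { uint16_t nla_len; uint16_t nla_type; };

#define NLA_ALIGNTO		4
#define NLA_ALIGN(len)		(((len) + NLA_ALIGNTO - 1) & ~(NLA_ALIGNTO - 1))
#define NLA_HDRLEN		((int)NLA_ALIGN(sizeof(struct nlattr)))
#define nla_attr_size(payload)	(NLA_HDRLEN + (payload))
#define nla_total_size(payload)	(NLA_ALIGN(nla_attr_size(payload)))

/* Hypothetical stand-in for struct gnet_estimator (interval + ewma_log). */
struct gnet_estimator_example { signed char interval; unsigned char ewma_log; };

int main(void)
{
	size_t payload = sizeof(struct gnet_estimator_example);

	printf("hdr=%d attr_size=%zu total_size=%zu\n",
	       NLA_HDRLEN, (size_t)nla_attr_size(payload),
	       (size_t)nla_total_size(payload));
	return 0;
}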
1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 // SPDX-License-Identifier: GPL-2.0-or-later /* * sctp_offload - GRO/GSO Offloading for SCTP * * Copyright (C) 2015, Marcelo Ricardo Leitner <marcelo.leitner@gmail.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kernel.h> #include <linux/kprobes.h> #include <linux/socket.h> #include <linux/sctp.h> #include <linux/proc_fs.h> #include <linux/vmalloc.h> #include <linux/module.h> #include <linux/kfifo.h> #include <linux/time.h> #include <net/net_namespace.h> #include <linux/skbuff.h> #include <net/sctp/sctp.h> #include <net/sctp/checksum.h> #include <net/protocol.h> static __le32 sctp_gso_make_checksum(struct sk_buff *skb) { skb->ip_summed = CHECKSUM_NONE; skb->csum_not_inet = 0; /* csum and csum_start in GSO CB may be needed to do the UDP * checksum when it's a UDP tunneling packet. */ SKB_GSO_CB(skb)->csum = (__force __wsum)~0; SKB_GSO_CB(skb)->csum_start = skb_headroom(skb) + skb->len; return sctp_compute_cksum(skb, skb_transport_offset(skb)); } static struct sk_buff *sctp_gso_segment(struct sk_buff *skb, netdev_features_t features) { struct sk_buff *segs = ERR_PTR(-EINVAL); struct sctphdr *sh; if (!skb_is_gso_sctp(skb)) goto out; sh = sctp_hdr(skb); if (!pskb_may_pull(skb, sizeof(*sh))) goto out; __skb_pull(skb, sizeof(*sh)); if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) { /* Packet is from an untrusted source, reset gso_segs. */ struct skb_shared_info *pinfo = skb_shinfo(skb); struct sk_buff *frag_iter; pinfo->gso_segs = 0; if (skb->len != skb->data_len) { /* Means we have chunks in here too */ pinfo->gso_segs++; } skb_walk_frags(skb, frag_iter) pinfo->gso_segs++; segs = NULL; goto out; } segs = skb_segment(skb, (features | NETIF_F_HW_CSUM) & ~NETIF_F_SG); if (IS_ERR(segs)) goto out; /* All that is left is update SCTP CRC if necessary */ if (!(features & NETIF_F_SCTP_CRC)) { for (skb = segs; skb; skb = skb->next) { if (skb->ip_summed == CHECKSUM_PARTIAL) { sh = sctp_hdr(skb); sh->checksum = sctp_gso_make_checksum(skb); } } } out: return segs; } static const struct net_offload sctp_offload = { .callbacks = { .gso_segment = sctp_gso_segment, }, }; static const struct net_offload sctp6_offload = { .callbacks = { .gso_segment = sctp_gso_segment, }, }; int __init sctp_offload_init(void) { int ret; ret = inet_add_offload(&sctp_offload, IPPROTO_SCTP); if (ret) goto out; ret = inet6_add_offload(&sctp6_offload, IPPROTO_SCTP); if (ret) goto ipv4; crc32c_csum_stub = &sctp_csum_ops; return ret; ipv4: inet_del_offload(&sctp_offload, IPPROTO_SCTP); out: return ret; }
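When the device cannot offload the checksum (no NETIF_F_SCTP_CRC), each segment is finished with sctp_compute_cksum(), which is a CRC-32c (Castagnoli) over the packet. For reference, here is a standalone bit-at-a-time CRC-32c sketch using the reflected polynomial 0x82f63b78; the kernel uses its own crc32c library routines, so this is only illustrative.

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

/* Bit-at-a-time CRC-32c (Castagnoli), reflected polynomial 0x82f63b78. */
static uint32_t crc32c(const uint8_t *buf, size_t len)
{
	uint32_t crc = 0xffffffffu;

	while (len--) {
		crc ^= *buf++;
		for (int i = 0; i < 8; i++)
			crc = (crc >> 1) ^ (0x82f63b78u & -(crc & 1));
	}
	return ~crc;
}

int main(void)
{
	const uint8_t msg[] = "123456789";

	/* Well-known check value for CRC-32c("123456789") is 0xe3069283. */
	printf("crc32c=0x%08x\n", crc32c(msg, 9));
	return 0;
}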
923 924 143 143 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 #include <linux/init.h> #include <linux/kernel.h> #include <linux/netdevice.h> #include <net/net_namespace.h> #include <net/netfilter/nf_tables.h> #include <linux/netfilter_ipv4.h> #include <linux/netfilter_ipv6.h> #include <linux/netfilter_bridge.h> #include <linux/netfilter_arp.h> #include <net/netfilter/nf_tables_ipv4.h> #include <net/netfilter/nf_tables_ipv6.h> #ifdef CONFIG_NF_TABLES_IPV4 static unsigned int nft_do_chain_ipv4(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct nft_pktinfo pkt; nft_set_pktinfo(&pkt, skb, state); nft_set_pktinfo_ipv4(&pkt); return nft_do_chain(&pkt, priv); } static const struct nft_chain_type nft_chain_filter_ipv4 = { .name = "filter", .type = NFT_CHAIN_T_DEFAULT, .family = NFPROTO_IPV4, .hook_mask = (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_FORWARD) | (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_POST_ROUTING), .hooks = { [NF_INET_LOCAL_IN] = nft_do_chain_ipv4, [NF_INET_LOCAL_OUT] = nft_do_chain_ipv4, [NF_INET_FORWARD] = nft_do_chain_ipv4, [NF_INET_PRE_ROUTING] = nft_do_chain_ipv4, [NF_INET_POST_ROUTING] = nft_do_chain_ipv4, }, }; static void nft_chain_filter_ipv4_init(void) { nft_register_chain_type(&nft_chain_filter_ipv4); } static void nft_chain_filter_ipv4_fini(void) { nft_unregister_chain_type(&nft_chain_filter_ipv4); } #else static inline void nft_chain_filter_ipv4_init(void) {} static inline void nft_chain_filter_ipv4_fini(void) {} #endif /* CONFIG_NF_TABLES_IPV4 */ #ifdef CONFIG_NF_TABLES_ARP static unsigned int nft_do_chain_arp(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct nft_pktinfo pkt; nft_set_pktinfo(&pkt, skb, state); nft_set_pktinfo_unspec(&pkt); return nft_do_chain(&pkt, priv); } static const struct nft_chain_type nft_chain_filter_arp = { .name = "filter", 
.type = NFT_CHAIN_T_DEFAULT, .family = NFPROTO_ARP, .owner = THIS_MODULE, .hook_mask = (1 << NF_ARP_IN) | (1 << NF_ARP_OUT), .hooks = { [NF_ARP_IN] = nft_do_chain_arp, [NF_ARP_OUT] = nft_do_chain_arp, }, }; static void nft_chain_filter_arp_init(void) { nft_register_chain_type(&nft_chain_filter_arp); } static void nft_chain_filter_arp_fini(void) { nft_unregister_chain_type(&nft_chain_filter_arp); } #else static inline void nft_chain_filter_arp_init(void) {} static inline void nft_chain_filter_arp_fini(void) {} #endif /* CONFIG_NF_TABLES_ARP */ #ifdef CONFIG_NF_TABLES_IPV6 static unsigned int nft_do_chain_ipv6(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct nft_pktinfo pkt; nft_set_pktinfo(&pkt, skb, state); nft_set_pktinfo_ipv6(&pkt); return nft_do_chain(&pkt, priv); } static const struct nft_chain_type nft_chain_filter_ipv6 = { .name = "filter", .type = NFT_CHAIN_T_DEFAULT, .family = NFPROTO_IPV6, .hook_mask = (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_FORWARD) | (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_POST_ROUTING), .hooks = { [NF_INET_LOCAL_IN] = nft_do_chain_ipv6, [NF_INET_LOCAL_OUT] = nft_do_chain_ipv6, [NF_INET_FORWARD] = nft_do_chain_ipv6, [NF_INET_PRE_ROUTING] = nft_do_chain_ipv6, [NF_INET_POST_ROUTING] = nft_do_chain_ipv6, }, }; static void nft_chain_filter_ipv6_init(void) { nft_register_chain_type(&nft_chain_filter_ipv6); } static void nft_chain_filter_ipv6_fini(void) { nft_unregister_chain_type(&nft_chain_filter_ipv6); } #else static inline void nft_chain_filter_ipv6_init(void) {} static inline void nft_chain_filter_ipv6_fini(void) {} #endif /* CONFIG_NF_TABLES_IPV6 */ #ifdef CONFIG_NF_TABLES_INET static unsigned int nft_do_chain_inet(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct nft_pktinfo pkt; nft_set_pktinfo(&pkt, skb, state); switch (state->pf) { case NFPROTO_IPV4: nft_set_pktinfo_ipv4(&pkt); break; case NFPROTO_IPV6: nft_set_pktinfo_ipv6(&pkt); break; default: break; } return nft_do_chain(&pkt, priv); } static unsigned int nft_do_chain_inet_ingress(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct nf_hook_state ingress_state = *state; struct nft_pktinfo pkt; switch (skb->protocol) { case htons(ETH_P_IP): /* Original hook is NFPROTO_NETDEV and NF_NETDEV_INGRESS. 
*/ ingress_state.pf = NFPROTO_IPV4; ingress_state.hook = NF_INET_INGRESS; nft_set_pktinfo(&pkt, skb, &ingress_state); if (nft_set_pktinfo_ipv4_ingress(&pkt) < 0) return NF_DROP; break; case htons(ETH_P_IPV6): ingress_state.pf = NFPROTO_IPV6; ingress_state.hook = NF_INET_INGRESS; nft_set_pktinfo(&pkt, skb, &ingress_state); if (nft_set_pktinfo_ipv6_ingress(&pkt) < 0) return NF_DROP; break; default: return NF_ACCEPT; } return nft_do_chain(&pkt, priv); } static const struct nft_chain_type nft_chain_filter_inet = { .name = "filter", .type = NFT_CHAIN_T_DEFAULT, .family = NFPROTO_INET, .hook_mask = (1 << NF_INET_INGRESS) | (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_FORWARD) | (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_POST_ROUTING), .hooks = { [NF_INET_INGRESS] = nft_do_chain_inet_ingress, [NF_INET_LOCAL_IN] = nft_do_chain_inet, [NF_INET_LOCAL_OUT] = nft_do_chain_inet, [NF_INET_FORWARD] = nft_do_chain_inet, [NF_INET_PRE_ROUTING] = nft_do_chain_inet, [NF_INET_POST_ROUTING] = nft_do_chain_inet, }, }; static void nft_chain_filter_inet_init(void) { nft_register_chain_type(&nft_chain_filter_inet); } static void nft_chain_filter_inet_fini(void) { nft_unregister_chain_type(&nft_chain_filter_inet); } #else static inline void nft_chain_filter_inet_init(void) {} static inline void nft_chain_filter_inet_fini(void) {} #endif /* CONFIG_NF_TABLES_IPV6 */ #if IS_ENABLED(CONFIG_NF_TABLES_BRIDGE) static unsigned int nft_do_chain_bridge(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct nft_pktinfo pkt; nft_set_pktinfo(&pkt, skb, state); switch (eth_hdr(skb)->h_proto) { case htons(ETH_P_IP): nft_set_pktinfo_ipv4_validate(&pkt); break; case htons(ETH_P_IPV6): nft_set_pktinfo_ipv6_validate(&pkt); break; default: nft_set_pktinfo_unspec(&pkt); break; } return nft_do_chain(&pkt, priv); } static const struct nft_chain_type nft_chain_filter_bridge = { .name = "filter", .type = NFT_CHAIN_T_DEFAULT, .family = NFPROTO_BRIDGE, .hook_mask = (1 << NF_BR_PRE_ROUTING) | (1 << NF_BR_LOCAL_IN) | (1 << NF_BR_FORWARD) | (1 << NF_BR_LOCAL_OUT) | (1 << NF_BR_POST_ROUTING), .hooks = { [NF_BR_PRE_ROUTING] = nft_do_chain_bridge, [NF_BR_LOCAL_IN] = nft_do_chain_bridge, [NF_BR_FORWARD] = nft_do_chain_bridge, [NF_BR_LOCAL_OUT] = nft_do_chain_bridge, [NF_BR_POST_ROUTING] = nft_do_chain_bridge, }, }; static void nft_chain_filter_bridge_init(void) { nft_register_chain_type(&nft_chain_filter_bridge); } static void nft_chain_filter_bridge_fini(void) { nft_unregister_chain_type(&nft_chain_filter_bridge); } #else static inline void nft_chain_filter_bridge_init(void) {} static inline void nft_chain_filter_bridge_fini(void) {} #endif /* CONFIG_NF_TABLES_BRIDGE */ #ifdef CONFIG_NF_TABLES_NETDEV static unsigned int nft_do_chain_netdev(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct nft_pktinfo pkt; nft_set_pktinfo(&pkt, skb, state); switch (skb->protocol) { case htons(ETH_P_IP): nft_set_pktinfo_ipv4_validate(&pkt); break; case htons(ETH_P_IPV6): nft_set_pktinfo_ipv6_validate(&pkt); break; default: nft_set_pktinfo_unspec(&pkt); break; } return nft_do_chain(&pkt, priv); } static const struct nft_chain_type nft_chain_filter_netdev = { .name = "filter", .type = NFT_CHAIN_T_DEFAULT, .family = NFPROTO_NETDEV, .hook_mask = (1 << NF_NETDEV_INGRESS), .hooks = { [NF_NETDEV_INGRESS] = nft_do_chain_netdev, }, }; static void nft_netdev_event(unsigned long event, struct net_device *dev, struct nft_ctx *ctx) { struct nft_base_chain *basechain = nft_base_chain(ctx->chain); struct nft_hook 
*hook, *found = NULL; int n = 0; if (event != NETDEV_UNREGISTER) return; list_for_each_entry(hook, &basechain->hook_list, list) { if (hook->ops.dev == dev) found = hook; n++; } if (!found) return; if (n > 1) { if (!(ctx->chain->table->flags & NFT_TABLE_F_DORMANT)) nf_unregister_net_hook(ctx->net, &found->ops); list_del_rcu(&found->list); kfree_rcu(found, rcu); return; } /* UNREGISTER events are also happening on netns exit. * * Although nf_tables core releases all tables/chains, only this event * handler provides guarantee that hook->ops.dev is still accessible, * so we cannot skip exiting net namespaces. */ __nft_release_basechain(ctx); } static int nf_tables_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct nft_base_chain *basechain; struct nftables_pernet *nft_net; struct nft_chain *chain, *nr; struct nft_table *table; struct nft_ctx ctx = { .net = dev_net(dev), }; if (event != NETDEV_UNREGISTER && event != NETDEV_CHANGENAME) return NOTIFY_DONE; nft_net = nft_pernet(ctx.net); mutex_lock(&nft_net->commit_mutex); list_for_each_entry(table, &nft_net->tables, list) { if (table->family != NFPROTO_NETDEV && table->family != NFPROTO_INET) continue; ctx.family = table->family; ctx.table = table; list_for_each_entry_safe(chain, nr, &table->chains, list) { if (!nft_is_base_chain(chain)) continue; basechain = nft_base_chain(chain); if (table->family == NFPROTO_INET && basechain->ops.hooknum != NF_INET_INGRESS) continue; ctx.chain = chain; nft_netdev_event(event, dev, &ctx); } } mutex_unlock(&nft_net->commit_mutex); return NOTIFY_DONE; } static struct notifier_block nf_tables_netdev_notifier = { .notifier_call = nf_tables_netdev_event, }; static int nft_chain_filter_netdev_init(void) { int err; nft_register_chain_type(&nft_chain_filter_netdev); err = register_netdevice_notifier(&nf_tables_netdev_notifier); if (err) goto err_register_netdevice_notifier; return 0; err_register_netdevice_notifier: nft_unregister_chain_type(&nft_chain_filter_netdev); return err; } static void nft_chain_filter_netdev_fini(void) { nft_unregister_chain_type(&nft_chain_filter_netdev); unregister_netdevice_notifier(&nf_tables_netdev_notifier); } #else static inline int nft_chain_filter_netdev_init(void) { return 0; } static inline void nft_chain_filter_netdev_fini(void) {} #endif /* CONFIG_NF_TABLES_NETDEV */ int __init nft_chain_filter_init(void) { int err; err = nft_chain_filter_netdev_init(); if (err < 0) return err; nft_chain_filter_ipv4_init(); nft_chain_filter_ipv6_init(); nft_chain_filter_arp_init(); nft_chain_filter_inet_init(); nft_chain_filter_bridge_init(); return 0; } void nft_chain_filter_fini(void) { nft_chain_filter_bridge_fini(); nft_chain_filter_inet_fini(); nft_chain_filter_arp_fini(); nft_chain_filter_ipv6_fini(); nft_chain_filter_ipv4_fini(); nft_chain_filter_netdev_fini(); }
266 267 155 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_HUGE_MM_H #define _LINUX_HUGE_MM_H #include <linux/sched/coredump.h> #include <linux/mm_types.h> #include <linux/fs.h> /* only for vma_is_dax() */ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf); int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma); void huge_pmd_set_accessed(struct vm_fault *vmf); int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm, pud_t *dst_pud, pud_t *src_pud, unsigned long addr, struct vm_area_struct *vma); #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud); #else static inline void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud) { } #endif vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf); struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, unsigned int flags); bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long next); int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr); int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma, pud_t *pud, unsigned long addr); bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd); int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, pgprot_t newprot, unsigned long cp_flags); vm_fault_t 
vmf_insert_pfn_pmd_prot(struct vm_fault *vmf, pfn_t pfn, pgprot_t pgprot, bool write); /** * vmf_insert_pfn_pmd - insert a pmd size pfn * @vmf: Structure describing the fault * @pfn: pfn to insert * @pgprot: page protection to use * @write: whether it's a write fault * * Insert a pmd size pfn. See vmf_insert_pfn() for additional info. * * Return: vm_fault_t value. */ static inline vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write) { return vmf_insert_pfn_pmd_prot(vmf, pfn, vmf->vma->vm_page_prot, write); } vm_fault_t vmf_insert_pfn_pud_prot(struct vm_fault *vmf, pfn_t pfn, pgprot_t pgprot, bool write); /** * vmf_insert_pfn_pud - insert a pud size pfn * @vmf: Structure describing the fault * @pfn: pfn to insert * @pgprot: page protection to use * @write: whether it's a write fault * * Insert a pud size pfn. See vmf_insert_pfn() for additional info. * * Return: vm_fault_t value. */ static inline vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write) { return vmf_insert_pfn_pud_prot(vmf, pfn, vmf->vma->vm_page_prot, write); } enum transparent_hugepage_flag { TRANSPARENT_HUGEPAGE_NEVER_DAX, TRANSPARENT_HUGEPAGE_FLAG, TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG, TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG, }; struct kobject; struct kobj_attribute; ssize_t single_hugepage_flag_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count, enum transparent_hugepage_flag flag); ssize_t single_hugepage_flag_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf, enum transparent_hugepage_flag flag); extern struct kobj_attribute shmem_enabled_attr; #define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT) #define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER) #ifdef CONFIG_TRANSPARENT_HUGEPAGE #define HPAGE_PMD_SHIFT PMD_SHIFT #define HPAGE_PMD_SIZE ((1UL) << HPAGE_PMD_SHIFT) #define HPAGE_PMD_MASK (~(HPAGE_PMD_SIZE - 1)) #define HPAGE_PUD_SHIFT PUD_SHIFT #define HPAGE_PUD_SIZE ((1UL) << HPAGE_PUD_SHIFT) #define HPAGE_PUD_MASK (~(HPAGE_PUD_SIZE - 1)) extern unsigned long transparent_hugepage_flags; static inline bool transhuge_vma_suitable(struct vm_area_struct *vma, unsigned long haddr) { /* Don't have to check pgoff for anonymous vma */ if (!vma_is_anonymous(vma)) { if (!IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff, HPAGE_PMD_NR)) return false; } if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end) return false; return true; } static inline bool transhuge_vma_enabled(struct vm_area_struct *vma, unsigned long vm_flags) { /* Explicitly disabled through madvise. */ if ((vm_flags & VM_NOHUGEPAGE) || test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) return false; return true; } /* * to be used on vmas which are known to support THP. * Use transparent_hugepage_active otherwise */ static inline bool __transparent_hugepage_enabled(struct vm_area_struct *vma) { /* * If the hardware/firmware marked hugepage support disabled. 
*/ if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_NEVER_DAX)) return false; if (!transhuge_vma_enabled(vma, vma->vm_flags)) return false; if (vma_is_temporary_stack(vma)) return false; if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_FLAG)) return true; if (vma_is_dax(vma)) return true; if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)) return !!(vma->vm_flags & VM_HUGEPAGE); return false; } bool transparent_hugepage_active(struct vm_area_struct *vma); #define transparent_hugepage_use_zero_page() \ (transparent_hugepage_flags & \ (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG)) unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); void prep_transhuge_page(struct page *page); void free_transhuge_page(struct page *page); bool is_transparent_hugepage(struct page *page); bool can_split_huge_page(struct page *page, int *pextra_pins); int split_huge_page_to_list(struct page *page, struct list_head *list); static inline int split_huge_page(struct page *page) { return split_huge_page_to_list(page, NULL); } void deferred_split_huge_page(struct page *page); void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long address, bool freeze, struct page *page); #define split_huge_pmd(__vma, __pmd, __address) \ do { \ pmd_t *____pmd = (__pmd); \ if (is_swap_pmd(*____pmd) || pmd_trans_huge(*____pmd) \ || pmd_devmap(*____pmd)) \ __split_huge_pmd(__vma, __pmd, __address, \ false, NULL); \ } while (0) void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, bool freeze, struct page *page); void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud, unsigned long address); #define split_huge_pud(__vma, __pud, __address) \ do { \ pud_t *____pud = (__pud); \ if (pud_trans_huge(*____pud) \ || pud_devmap(*____pud)) \ __split_huge_pud(__vma, __pud, __address); \ } while (0) int hugepage_madvise(struct vm_area_struct *vma, unsigned long *vm_flags, int advice); void vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start, unsigned long end, long adjust_next); spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma); spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma); static inline int is_swap_pmd(pmd_t pmd) { return !pmd_none(pmd) && !pmd_present(pmd); } /* mmap_lock must be held on entry */ static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) { if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) return __pmd_trans_huge_lock(pmd, vma); else return NULL; } static inline spinlock_t *pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma) { if (pud_trans_huge(*pud) || pud_devmap(*pud)) return __pud_trans_huge_lock(pud, vma); else return NULL; } /** * thp_head - Head page of a transparent huge page. * @page: Any page (tail, head or regular) found in the page cache. */ static inline struct page *thp_head(struct page *page) { return compound_head(page); } /** * thp_order - Order of a transparent huge page. * @page: Head page of a transparent huge page. */ static inline unsigned int thp_order(struct page *page) { VM_BUG_ON_PGFLAGS(PageTail(page), page); if (PageHead(page)) return HPAGE_PMD_ORDER; return 0; } /** * thp_nr_pages - The number of regular pages in this huge page. * @page: The head page of a huge page. 
*/ static inline int thp_nr_pages(struct page *page) { VM_BUG_ON_PGFLAGS(PageTail(page), page); if (PageHead(page)) return HPAGE_PMD_NR; return 1; } struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, int flags, struct dev_pagemap **pgmap); struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr, pud_t *pud, int flags, struct dev_pagemap **pgmap); vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf); extern struct page *huge_zero_page; extern unsigned long huge_zero_pfn; static inline bool is_huge_zero_page(struct page *page) { return READ_ONCE(huge_zero_page) == page; } static inline bool is_huge_zero_pmd(pmd_t pmd) { return READ_ONCE(huge_zero_pfn) == pmd_pfn(pmd) && pmd_present(pmd); } static inline bool is_huge_zero_pud(pud_t pud) { return false; } struct page *mm_get_huge_zero_page(struct mm_struct *mm); void mm_put_huge_zero_page(struct mm_struct *mm); #define mk_huge_pmd(page, prot) pmd_mkhuge(mk_pmd(page, prot)) static inline bool thp_migration_supported(void) { return IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION); } static inline struct list_head *page_deferred_list(struct page *page) { /* * Global or memcg deferred list in the second tail pages is * occupied by compound_head. */ return &page[2].deferred_list; } #else /* CONFIG_TRANSPARENT_HUGEPAGE */ #define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; }) #define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; }) #define HPAGE_PMD_SIZE ({ BUILD_BUG(); 0; }) #define HPAGE_PUD_SHIFT ({ BUILD_BUG(); 0; }) #define HPAGE_PUD_MASK ({ BUILD_BUG(); 0; }) #define HPAGE_PUD_SIZE ({ BUILD_BUG(); 0; }) static inline struct page *thp_head(struct page *page) { VM_BUG_ON_PGFLAGS(PageTail(page), page); return page; } static inline unsigned int thp_order(struct page *page) { VM_BUG_ON_PGFLAGS(PageTail(page), page); return 0; } static inline int thp_nr_pages(struct page *page) { VM_BUG_ON_PGFLAGS(PageTail(page), page); return 1; } static inline bool __transparent_hugepage_enabled(struct vm_area_struct *vma) { return false; } static inline bool transparent_hugepage_active(struct vm_area_struct *vma) { return false; } static inline bool transhuge_vma_suitable(struct vm_area_struct *vma, unsigned long haddr) { return false; } static inline bool transhuge_vma_enabled(struct vm_area_struct *vma, unsigned long vm_flags) { return false; } static inline void prep_transhuge_page(struct page *page) {} static inline bool is_transparent_hugepage(struct page *page) { return false; } #define transparent_hugepage_flags 0UL #define thp_get_unmapped_area NULL static inline bool can_split_huge_page(struct page *page, int *pextra_pins) { BUILD_BUG(); return false; } static inline int split_huge_page_to_list(struct page *page, struct list_head *list) { return 0; } static inline int split_huge_page(struct page *page) { return 0; } static inline void deferred_split_huge_page(struct page *page) {} #define split_huge_pmd(__vma, __pmd, __address) \ do { } while (0) static inline void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long address, bool freeze, struct page *page) {} static inline void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, bool freeze, struct page *page) {} #define split_huge_pud(__vma, __pmd, __address) \ do { } while (0) static inline int hugepage_madvise(struct vm_area_struct *vma, unsigned long *vm_flags, int advice) { BUG(); return 0; } static inline void vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start, unsigned long end, long adjust_next) { } 
static inline int is_swap_pmd(pmd_t pmd) { return 0; } static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) { return NULL; } static inline spinlock_t *pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma) { return NULL; } static inline vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf) { return 0; } static inline bool is_huge_zero_page(struct page *page) { return false; } static inline bool is_huge_zero_pmd(pmd_t pmd) { return false; } static inline bool is_huge_zero_pud(pud_t pud) { return false; } static inline void mm_put_huge_zero_page(struct mm_struct *mm) { return; } static inline struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, int flags, struct dev_pagemap **pgmap) { return NULL; } static inline struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr, pud_t *pud, int flags, struct dev_pagemap **pgmap) { return NULL; } static inline bool thp_migration_supported(void) { return false; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ /** * thp_size - Size of a transparent huge page. * @page: Head page of a transparent huge page. * * Return: Number of bytes in this page. */ static inline unsigned long thp_size(struct page *page) { return PAGE_SIZE << thp_order(page); } #endif /* _LINUX_HUGE_MM_H */
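For a sense of scale, the size macros above are pure shift arithmetic: HPAGE_PMD_ORDER = PMD_SHIFT - PAGE_SHIFT, HPAGE_PMD_NR = 1 << order, and thp_size() = PAGE_SIZE << thp_order(). The worked example below assumes typical x86-64 values (PAGE_SHIFT 12, PMD_SHIFT 21, PUD_SHIFT 30); these shifts are architecture-dependent, so the printed numbers are illustrative only.

#include <stdio.h>

int main(void)
{
	const unsigned int PAGE_SHIFT = 12, PMD_SHIFT = 21, PUD_SHIFT = 30;

	unsigned int hpage_pmd_order = PMD_SHIFT - PAGE_SHIFT;	/* 9 */
	unsigned long hpage_pmd_nr = 1UL << hpage_pmd_order;	/* 512 base pages */
	unsigned long hpage_pmd_size = 1UL << PMD_SHIFT;	/* 2 MiB */
	unsigned long hpage_pud_size = 1UL << PUD_SHIFT;	/* 1 GiB */

	/* thp_size(page) == PAGE_SIZE << thp_order(page) for a PMD-sized THP. */
	unsigned long thp_size = (1UL << PAGE_SHIFT) << hpage_pmd_order;

	printf("order=%u nr=%lu pmd=%lu B pud=%lu B thp=%lu B\n",
	       hpage_pmd_order, hpage_pmd_nr, hpage_pmd_size,
	       hpage_pud_size, thp_size);
	return 0;
}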
1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 // SPDX-License-Identifier: GPL-2.0-only /* * fs/kernfs/dir.c - kernfs directory implementation * * Copyright (c) 2001-3 Patrick Mochel * Copyright (c) 2007 SUSE Linux Products GmbH * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org> */ #include <linux/sched.h> #include <linux/fs.h> #include <linux/namei.h> #include <linux/idr.h> #include <linux/slab.h> #include <linux/security.h> #include <linux/hash.h> #include "kernfs-internal.h" DECLARE_RWSEM(kernfs_rwsem); static DEFINE_SPINLOCK(kernfs_rename_lock); /* kn->parent and ->name */ /* * Don't use rename_lock to piggy back on pr_cont_buf. We don't want to * call pr_cont() while holding rename_lock. Because sometimes pr_cont() * will perform wakeups when releasing console_sem. Holding rename_lock * will introduce deadlock if the scheduler reads the kernfs_name in the * wakeup path. */ static DEFINE_SPINLOCK(kernfs_pr_cont_lock); static char kernfs_pr_cont_buf[PATH_MAX]; /* protected by pr_cont_lock */ static DEFINE_SPINLOCK(kernfs_idr_lock); /* root->ino_idr */ #define rb_to_kn(X) rb_entry((X), struct kernfs_node, rb) static bool kernfs_active(struct kernfs_node *kn) { lockdep_assert_held(&kernfs_rwsem); return atomic_read(&kn->active) >= 0; } static bool kernfs_lockdep(struct kernfs_node *kn) { #ifdef CONFIG_DEBUG_LOCK_ALLOC return kn->flags & KERNFS_LOCKDEP; #else return false; #endif } static int kernfs_name_locked(struct kernfs_node *kn, char *buf, size_t buflen) { if (!kn) return strlcpy(buf, "(null)", buflen); return strlcpy(buf, kn->parent ? kn->name : "/", buflen); } /* kernfs_node_depth - compute depth from @from to @to */ static size_t kernfs_depth(struct kernfs_node *from, struct kernfs_node *to) { size_t depth = 0; while (to->parent && to != from) { depth++; to = to->parent; } return depth; } static struct kernfs_node *kernfs_common_ancestor(struct kernfs_node *a, struct kernfs_node *b) { size_t da, db; struct kernfs_root *ra = kernfs_root(a), *rb = kernfs_root(b); if (ra != rb) return NULL; da = kernfs_depth(ra->kn, a); db = kernfs_depth(rb->kn, b); while (da > db) { a = a->parent; da--; } while (db > da) { b = b->parent; db--; } /* worst case b and a will be the same at root */ while (b != a) { b = b->parent; a = a->parent; } return a; } /** * kernfs_path_from_node_locked - find a pseudo-absolute path to @kn_to, * where kn_from is treated as root of the path. 
* @kn_from: kernfs node which should be treated as root for the path * @kn_to: kernfs node to which path is needed * @buf: buffer to copy the path into * @buflen: size of @buf * * We need to handle couple of scenarios here: * [1] when @kn_from is an ancestor of @kn_to at some level * kn_from: /n1/n2/n3 * kn_to: /n1/n2/n3/n4/n5 * result: /n4/n5 * * [2] when @kn_from is on a different hierarchy and we need to find common * ancestor between @kn_from and @kn_to. * kn_from: /n1/n2/n3/n4 * kn_to: /n1/n2/n5 * result: /../../n5 * OR * kn_from: /n1/n2/n3/n4/n5 [depth=5] * kn_to: /n1/n2/n3 [depth=3] * result: /../.. * * [3] when @kn_to is NULL result will be "(null)" * * Returns the length of the full path. If the full length is equal to or * greater than @buflen, @buf contains the truncated path with the trailing * '\0'. On error, -errno is returned. */ static int kernfs_path_from_node_locked(struct kernfs_node *kn_to, struct kernfs_node *kn_from, char *buf, size_t buflen) { struct kernfs_node *kn, *common; const char parent_str[] = "/.."; size_t depth_from, depth_to, len = 0; int i, j; if (!kn_to) return strlcpy(buf, "(null)", buflen); if (!kn_from) kn_from = kernfs_root(kn_to)->kn; if (kn_from == kn_to) return strlcpy(buf, "/", buflen); if (!buf) return -EINVAL; common = kernfs_common_ancestor(kn_from, kn_to); if (WARN_ON(!common)) return -EINVAL; depth_to = kernfs_depth(common, kn_to); depth_from = kernfs_depth(common, kn_from); buf[0] = '\0'; for (i = 0; i < depth_from; i++) len += strlcpy(buf + len, parent_str, len < buflen ? buflen - len : 0); /* Calculate how many bytes we need for the rest */ for (i = depth_to - 1; i >= 0; i--) { for (kn = kn_to, j = 0; j < i; j++) kn = kn->parent; len += strlcpy(buf + len, "/", len < buflen ? buflen - len : 0); len += strlcpy(buf + len, kn->name, len < buflen ? buflen - len : 0); } return len; } /** * kernfs_name - obtain the name of a given node * @kn: kernfs_node of interest * @buf: buffer to copy @kn's name into * @buflen: size of @buf * * Copies the name of @kn into @buf of @buflen bytes. The behavior is * similar to strlcpy(). It returns the length of @kn's name and if @buf * isn't long enough, it's filled upto @buflen-1 and nul terminated. * * Fills buffer with "(null)" if @kn is NULL. * * This function can be called from any context. */ int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen) { unsigned long flags; int ret; spin_lock_irqsave(&kernfs_rename_lock, flags); ret = kernfs_name_locked(kn, buf, buflen); spin_unlock_irqrestore(&kernfs_rename_lock, flags); return ret; } /** * kernfs_path_from_node - build path of node @to relative to @from. * @from: parent kernfs_node relative to which we need to build the path * @to: kernfs_node of interest * @buf: buffer to copy @to's path into * @buflen: size of @buf * * Builds @to's path relative to @from in @buf. @from and @to must * be on the same kernfs-root. If @from is not parent of @to, then a relative * path (which includes '..'s) as needed to reach from @from to @to is * returned. * * Returns the length of the full path. If the full length is equal to or * greater than @buflen, @buf contains the truncated path with the trailing * '\0'. On error, -errno is returned. 
*/ int kernfs_path_from_node(struct kernfs_node *to, struct kernfs_node *from, char *buf, size_t buflen) { unsigned long flags; int ret; spin_lock_irqsave(&kernfs_rename_lock, flags); ret = kernfs_path_from_node_locked(to, from, buf, buflen); spin_unlock_irqrestore(&kernfs_rename_lock, flags); return ret; } EXPORT_SYMBOL_GPL(kernfs_path_from_node); /** * pr_cont_kernfs_name - pr_cont name of a kernfs_node * @kn: kernfs_node of interest * * This function can be called from any context. */ void pr_cont_kernfs_name(struct kernfs_node *kn) { unsigned long flags; spin_lock_irqsave(&kernfs_pr_cont_lock, flags); kernfs_name(kn, kernfs_pr_cont_buf, sizeof(kernfs_pr_cont_buf)); pr_cont("%s", kernfs_pr_cont_buf); spin_unlock_irqrestore(&kernfs_pr_cont_lock, flags); } /** * pr_cont_kernfs_path - pr_cont path of a kernfs_node * @kn: kernfs_node of interest * * This function can be called from any context. */ void pr_cont_kernfs_path(struct kernfs_node *kn) { unsigned long flags; int sz; spin_lock_irqsave(&kernfs_pr_cont_lock, flags); sz = kernfs_path_from_node(kn, NULL, kernfs_pr_cont_buf, sizeof(kernfs_pr_cont_buf)); if (sz < 0) { pr_cont("(error)"); goto out; } if (sz >= sizeof(kernfs_pr_cont_buf)) { pr_cont("(name too long)"); goto out; } pr_cont("%s", kernfs_pr_cont_buf); out: spin_unlock_irqrestore(&kernfs_pr_cont_lock, flags); } /** * kernfs_get_parent - determine the parent node and pin it * @kn: kernfs_node of interest * * Determines @kn's parent, pins and returns it. This function can be * called from any context. */ struct kernfs_node *kernfs_get_parent(struct kernfs_node *kn) { struct kernfs_node *parent; unsigned long flags; spin_lock_irqsave(&kernfs_rename_lock, flags); parent = kn->parent; kernfs_get(parent); spin_unlock_irqrestore(&kernfs_rename_lock, flags); return parent; } /** * kernfs_name_hash * @name: Null terminated string to hash * @ns: Namespace tag to hash * * Returns 31 bit hash of ns + name (so it fits in an off_t ) */ static unsigned int kernfs_name_hash(const char *name, const void *ns) { unsigned long hash = init_name_hash(ns); unsigned int len = strlen(name); while (len--) hash = partial_name_hash(*name++, hash); hash = end_name_hash(hash); hash &= 0x7fffffffU; /* Reserve hash numbers 0, 1 and INT_MAX for magic directory entries */ if (hash < 2) hash += 2; if (hash >= INT_MAX) hash = INT_MAX - 1; return hash; } static int kernfs_name_compare(unsigned int hash, const char *name, const void *ns, const struct kernfs_node *kn) { if (hash < kn->hash) return -1; if (hash > kn->hash) return 1; if (ns < kn->ns) return -1; if (ns > kn->ns) return 1; return strcmp(name, kn->name); } static int kernfs_sd_compare(const struct kernfs_node *left, const struct kernfs_node *right) { return kernfs_name_compare(left->hash, left->name, left->ns, right); } /** * kernfs_link_sibling - link kernfs_node into sibling rbtree * @kn: kernfs_node of interest * * Link @kn into its sibling rbtree which starts from * @kn->parent->dir.children. * * Locking: * kernfs_rwsem held exclusive * * RETURNS: * 0 on susccess -EEXIST on failure. 
*/ static int kernfs_link_sibling(struct kernfs_node *kn) { struct rb_node **node = &kn->parent->dir.children.rb_node; struct rb_node *parent = NULL; while (*node) { struct kernfs_node *pos; int result; pos = rb_to_kn(*node); parent = *node; result = kernfs_sd_compare(kn, pos); if (result < 0) node = &pos->rb.rb_left; else if (result > 0) node = &pos->rb.rb_right; else return -EEXIST; } /* add new node and rebalance the tree */ rb_link_node(&kn->rb, parent, node); rb_insert_color(&kn->rb, &kn->parent->dir.children); /* successfully added, account subdir number */ if (kernfs_type(kn) == KERNFS_DIR) kn->parent->dir.subdirs++; kernfs_inc_rev(kn->parent); return 0; } /** * kernfs_unlink_sibling - unlink kernfs_node from sibling rbtree * @kn: kernfs_node of interest * * Try to unlink @kn from its sibling rbtree which starts from * kn->parent->dir.children. Returns %true if @kn was actually * removed, %false if @kn wasn't on the rbtree. * * Locking: * kernfs_rwsem held exclusive */ static bool kernfs_unlink_sibling(struct kernfs_node *kn) { if (RB_EMPTY_NODE(&kn->rb)) return false; if (kernfs_type(kn) == KERNFS_DIR) kn->parent->dir.subdirs--; kernfs_inc_rev(kn->parent); rb_erase(&kn->rb, &kn->parent->dir.children); RB_CLEAR_NODE(&kn->rb); return true; } /** * kernfs_get_active - get an active reference to kernfs_node * @kn: kernfs_node to get an active reference to * * Get an active reference of @kn. This function is noop if @kn * is NULL. * * RETURNS: * Pointer to @kn on success, NULL on failure. */ struct kernfs_node *kernfs_get_active(struct kernfs_node *kn) { if (unlikely(!kn)) return NULL; if (!atomic_inc_unless_negative(&kn->active)) return NULL; if (kernfs_lockdep(kn)) rwsem_acquire_read(&kn->dep_map, 0, 1, _RET_IP_); return kn; } /** * kernfs_put_active - put an active reference to kernfs_node * @kn: kernfs_node to put an active reference to * * Put an active reference to @kn. This function is noop if @kn * is NULL. */ void kernfs_put_active(struct kernfs_node *kn) { int v; if (unlikely(!kn)) return; if (kernfs_lockdep(kn)) rwsem_release(&kn->dep_map, _RET_IP_); v = atomic_dec_return(&kn->active); if (likely(v != KN_DEACTIVATED_BIAS)) return; wake_up_all(&kernfs_root(kn)->deactivate_waitq); } /** * kernfs_drain - drain kernfs_node * @kn: kernfs_node to drain * * Drain existing usages and nuke all existing mmaps of @kn. Mutiple * removers may invoke this function concurrently on @kn and all will * return after draining is complete. 
*/ static void kernfs_drain(struct kernfs_node *kn) __releases(&kernfs_rwsem) __acquires(&kernfs_rwsem) { struct kernfs_root *root = kernfs_root(kn); lockdep_assert_held_write(&kernfs_rwsem); WARN_ON_ONCE(kernfs_active(kn)); up_write(&kernfs_rwsem); if (kernfs_lockdep(kn)) { rwsem_acquire(&kn->dep_map, 0, 0, _RET_IP_); if (atomic_read(&kn->active) != KN_DEACTIVATED_BIAS) lock_contended(&kn->dep_map, _RET_IP_); } /* but everyone should wait for draining */ wait_event(root->deactivate_waitq, atomic_read(&kn->active) == KN_DEACTIVATED_BIAS); if (kernfs_lockdep(kn)) { lock_acquired(&kn->dep_map, _RET_IP_); rwsem_release(&kn->dep_map, _RET_IP_); } kernfs_drain_open_files(kn); down_write(&kernfs_rwsem); } /** * kernfs_get - get a reference count on a kernfs_node * @kn: the target kernfs_node */ void kernfs_get(struct kernfs_node *kn) { if (kn) { WARN_ON(!atomic_read(&kn->count)); atomic_inc(&kn->count); } } EXPORT_SYMBOL_GPL(kernfs_get); /** * kernfs_put - put a reference count on a kernfs_node * @kn: the target kernfs_node * * Put a reference count of @kn and destroy it if it reached zero. */ void kernfs_put(struct kernfs_node *kn) { struct kernfs_node *parent; struct kernfs_root *root; if (!kn || !atomic_dec_and_test(&kn->count)) return; root = kernfs_root(kn); repeat: /* * Moving/renaming is always done while holding reference. * kn->parent won't change beneath us. */ parent = kn->parent; WARN_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS, "kernfs_put: %s/%s: released with incorrect active_ref %d\n", parent ? parent->name : "", kn->name, atomic_read(&kn->active)); if (kernfs_type(kn) == KERNFS_LINK) kernfs_put(kn->symlink.target_kn); kfree_const(kn->name); if (kn->iattr) { simple_xattrs_free(&kn->iattr->xattrs); kmem_cache_free(kernfs_iattrs_cache, kn->iattr); } spin_lock(&kernfs_idr_lock); idr_remove(&root->ino_idr, (u32)kernfs_ino(kn)); spin_unlock(&kernfs_idr_lock); kmem_cache_free(kernfs_node_cache, kn); kn = parent; if (kn) { if (atomic_dec_and_test(&kn->count)) goto repeat; } else { /* just released the root kn, free @root too */ idr_destroy(&root->ino_idr); kfree(root); } } EXPORT_SYMBOL_GPL(kernfs_put); /** * kernfs_node_from_dentry - determine kernfs_node associated with a dentry * @dentry: the dentry in question * * Return the kernfs_node associated with @dentry. If @dentry is not a * kernfs one, %NULL is returned. * * While the returned kernfs_node will stay accessible as long as @dentry * is accessible, the returned node can be in any state and the caller is * fully responsible for determining what's accessible. 
*/ struct kernfs_node *kernfs_node_from_dentry(struct dentry *dentry) { if (dentry->d_sb->s_op == &kernfs_sops) return kernfs_dentry_node(dentry); return NULL; } static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, struct kernfs_node *parent, const char *name, umode_t mode, kuid_t uid, kgid_t gid, unsigned flags) { struct kernfs_node *kn; u32 id_highbits; int ret; name = kstrdup_const(name, GFP_KERNEL); if (!name) return NULL; kn = kmem_cache_zalloc(kernfs_node_cache, GFP_KERNEL); if (!kn) goto err_out1; idr_preload(GFP_KERNEL); spin_lock(&kernfs_idr_lock); ret = idr_alloc_cyclic(&root->ino_idr, kn, 1, 0, GFP_ATOMIC); if (ret >= 0 && ret < root->last_id_lowbits) root->id_highbits++; id_highbits = root->id_highbits; root->last_id_lowbits = ret; spin_unlock(&kernfs_idr_lock); idr_preload_end(); if (ret < 0) goto err_out2; kn->id = (u64)id_highbits << 32 | ret; atomic_set(&kn->count, 1); atomic_set(&kn->active, KN_DEACTIVATED_BIAS); RB_CLEAR_NODE(&kn->rb); kn->name = name; kn->mode = mode; kn->flags = flags; if (!uid_eq(uid, GLOBAL_ROOT_UID) || !gid_eq(gid, GLOBAL_ROOT_GID)) { struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID, .ia_uid = uid, .ia_gid = gid, }; ret = __kernfs_setattr(kn, &iattr); if (ret < 0) goto err_out3; } if (parent) { ret = security_kernfs_init_security(parent, kn); if (ret) goto err_out3; } return kn; err_out3: spin_lock(&kernfs_idr_lock); idr_remove(&root->ino_idr, (u32)kernfs_ino(kn)); spin_unlock(&kernfs_idr_lock); err_out2: kmem_cache_free(kernfs_node_cache, kn); err_out1: kfree_const(name); return NULL; } struct kernfs_node *kernfs_new_node(struct kernfs_node *parent, const char *name, umode_t mode, kuid_t uid, kgid_t gid, unsigned flags) { struct kernfs_node *kn; if (parent->mode & S_ISGID) { /* this code block imitates inode_init_owner() for * kernfs */ if (parent->iattr) gid = parent->iattr->ia_gid; if (flags & KERNFS_DIR) mode |= S_ISGID; } kn = __kernfs_new_node(kernfs_root(parent), parent, name, mode, uid, gid, flags); if (kn) { kernfs_get(parent); kn->parent = parent; } return kn; } /* * kernfs_find_and_get_node_by_id - get kernfs_node from node id * @root: the kernfs root * @id: the target node id * * @id's lower 32bits encode ino and upper gen. If the gen portion is * zero, all generations are matched. * * RETURNS: * NULL on failure. Return a kernfs node with reference counter incremented */ struct kernfs_node *kernfs_find_and_get_node_by_id(struct kernfs_root *root, u64 id) { struct kernfs_node *kn; ino_t ino = kernfs_id_ino(id); u32 gen = kernfs_id_gen(id); spin_lock(&kernfs_idr_lock); kn = idr_find(&root->ino_idr, (u32)ino); if (!kn) goto err_unlock; if (sizeof(ino_t) >= sizeof(u64)) { /* we looked up with the low 32bits, compare the whole */ if (kernfs_ino(kn) != ino) goto err_unlock; } else { /* 0 matches all generations */ if (unlikely(gen && kernfs_gen(kn) != gen)) goto err_unlock; } /* * ACTIVATED is protected with kernfs_mutex but it was clear when * @kn was added to idr and we just wanna see it set. No need to * grab kernfs_mutex. */ if (unlikely(!(kn->flags & KERNFS_ACTIVATED) || !atomic_inc_not_zero(&kn->count))) goto err_unlock; spin_unlock(&kernfs_idr_lock); return kn; err_unlock: spin_unlock(&kernfs_idr_lock); return NULL; } /** * kernfs_add_one - add kernfs_node to parent without warning * @kn: kernfs_node to be added * * The caller must already have initialized @kn->parent. This * function increments nlink of the parent's inode if @kn is a * directory and link into the children list of the parent. 
* * RETURNS: * 0 on success, -EEXIST if entry with the given name already * exists. */ int kernfs_add_one(struct kernfs_node *kn) { struct kernfs_node *parent = kn->parent; struct kernfs_iattrs *ps_iattr; bool has_ns; int ret; down_write(&kernfs_rwsem); ret = -EINVAL; has_ns = kernfs_ns_enabled(parent); if (WARN(has_ns != (bool)kn->ns, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n", has_ns ? "required" : "invalid", parent->name, kn->name)) goto out_unlock; if (kernfs_type(parent) != KERNFS_DIR) goto out_unlock; ret = -ENOENT; if (parent->flags & KERNFS_EMPTY_DIR) goto out_unlock; if ((parent->flags & KERNFS_ACTIVATED) && !kernfs_active(parent)) goto out_unlock; kn->hash = kernfs_name_hash(kn->name, kn->ns); ret = kernfs_link_sibling(kn); if (ret) goto out_unlock; /* Update timestamps on the parent */ ps_iattr = parent->iattr; if (ps_iattr) { ktime_get_real_ts64(&ps_iattr->ia_ctime); ps_iattr->ia_mtime = ps_iattr->ia_ctime; } up_write(&kernfs_rwsem); /* * Activate the new node unless CREATE_DEACTIVATED is requested. * If not activated here, the kernfs user is responsible for * activating the node with kernfs_activate(). A node which hasn't * been activated is not visible to userland and its removal won't * trigger deactivation. */ if (!(kernfs_root(kn)->flags & KERNFS_ROOT_CREATE_DEACTIVATED)) kernfs_activate(kn); return 0; out_unlock: up_write(&kernfs_rwsem); return ret; } /** * kernfs_find_ns - find kernfs_node with the given name * @parent: kernfs_node to search under * @name: name to look for * @ns: the namespace tag to use * * Look for kernfs_node with name @name under @parent. Returns pointer to * the found kernfs_node on success, %NULL on failure. */ static struct kernfs_node *kernfs_find_ns(struct kernfs_node *parent, const unsigned char *name, const void *ns) { struct rb_node *node = parent->dir.children.rb_node; bool has_ns = kernfs_ns_enabled(parent); unsigned int hash; lockdep_assert_held(&kernfs_rwsem); if (has_ns != (bool)ns) { WARN(1, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n", has_ns ? "required" : "invalid", parent->name, name); return NULL; } hash = kernfs_name_hash(name, ns); while (node) { struct kernfs_node *kn; int result; kn = rb_to_kn(node); result = kernfs_name_compare(hash, name, ns, kn); if (result < 0) node = node->rb_left; else if (result > 0) node = node->rb_right; else return kn; } return NULL; } static struct kernfs_node *kernfs_walk_ns(struct kernfs_node *parent, const unsigned char *path, const void *ns) { size_t len; char *p, *name; lockdep_assert_held_read(&kernfs_rwsem); spin_lock_irq(&kernfs_pr_cont_lock); len = strlcpy(kernfs_pr_cont_buf, path, sizeof(kernfs_pr_cont_buf)); if (len >= sizeof(kernfs_pr_cont_buf)) { spin_unlock_irq(&kernfs_pr_cont_lock); return NULL; } p = kernfs_pr_cont_buf; while ((name = strsep(&p, "/")) && parent) { if (*name == '\0') continue; parent = kernfs_find_ns(parent, name, ns); } spin_unlock_irq(&kernfs_pr_cont_lock); return parent; } /** * kernfs_find_and_get_ns - find and get kernfs_node with the given name * @parent: kernfs_node to search under * @name: name to look for * @ns: the namespace tag to use * * Look for kernfs_node with name @name under @parent and get a reference * if found. This function may sleep and returns pointer to the found * kernfs_node on success, %NULL on failure. 
*/ struct kernfs_node *kernfs_find_and_get_ns(struct kernfs_node *parent, const char *name, const void *ns) { struct kernfs_node *kn; down_read(&kernfs_rwsem); kn = kernfs_find_ns(parent, name, ns); kernfs_get(kn); up_read(&kernfs_rwsem); return kn; } EXPORT_SYMBOL_GPL(kernfs_find_and_get_ns); /** * kernfs_walk_and_get_ns - find and get kernfs_node with the given path * @parent: kernfs_node to search under * @path: path to look for * @ns: the namespace tag to use * * Look for kernfs_node with path @path under @parent and get a reference * if found. This function may sleep and returns pointer to the found * kernfs_node on success, %NULL on failure. */ struct kernfs_node *kernfs_walk_and_get_ns(struct kernfs_node *parent, const char *path, const void *ns) { struct kernfs_node *kn; down_read(&kernfs_rwsem); kn = kernfs_walk_ns(parent, path, ns); kernfs_get(kn); up_read(&kernfs_rwsem); return kn; } /** * kernfs_create_root - create a new kernfs hierarchy * @scops: optional syscall operations for the hierarchy * @flags: KERNFS_ROOT_* flags * @priv: opaque data associated with the new directory * * Returns the root of the new hierarchy on success, ERR_PTR() value on * failure. */ struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops, unsigned int flags, void *priv) { struct kernfs_root *root; struct kernfs_node *kn; root = kzalloc(sizeof(*root), GFP_KERNEL); if (!root) return ERR_PTR(-ENOMEM); idr_init(&root->ino_idr); INIT_LIST_HEAD(&root->supers); /* * On 64bit ino setups, id is ino. On 32bit, low 32bits are ino. * High bits generation. The starting value for both ino and * genenration is 1. Initialize upper 32bit allocation * accordingly. */ if (sizeof(ino_t) >= sizeof(u64)) root->id_highbits = 0; else root->id_highbits = 1; kn = __kernfs_new_node(root, NULL, "", S_IFDIR | S_IRUGO | S_IXUGO, GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, KERNFS_DIR); if (!kn) { idr_destroy(&root->ino_idr); kfree(root); return ERR_PTR(-ENOMEM); } kn->priv = priv; kn->dir.root = root; root->syscall_ops = scops; root->flags = flags; root->kn = kn; init_waitqueue_head(&root->deactivate_waitq); if (!(root->flags & KERNFS_ROOT_CREATE_DEACTIVATED)) kernfs_activate(kn); return root; } /** * kernfs_destroy_root - destroy a kernfs hierarchy * @root: root of the hierarchy to destroy * * Destroy the hierarchy anchored at @root by removing all existing * directories and destroying @root. */ void kernfs_destroy_root(struct kernfs_root *root) { kernfs_remove(root->kn); /* will also free @root */ } /** * kernfs_create_dir_ns - create a directory * @parent: parent in which to create a new directory * @name: name of the new directory * @mode: mode of the new directory * @uid: uid of the new directory * @gid: gid of the new directory * @priv: opaque data associated with the new directory * @ns: optional namespace tag of the directory * * Returns the created node on success, ERR_PTR() value on failure. 
*/ struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent, const char *name, umode_t mode, kuid_t uid, kgid_t gid, void *priv, const void *ns) { struct kernfs_node *kn; int rc; /* allocate */ kn = kernfs_new_node(parent, name, mode | S_IFDIR, uid, gid, KERNFS_DIR); if (!kn) return ERR_PTR(-ENOMEM); kn->dir.root = parent->dir.root; kn->ns = ns; kn->priv = priv; /* link in */ rc = kernfs_add_one(kn); if (!rc) return kn; kernfs_put(kn); return ERR_PTR(rc); } /** * kernfs_create_empty_dir - create an always empty directory * @parent: parent in which to create a new directory * @name: name of the new directory * * Returns the created node on success, ERR_PTR() value on failure. */ struct kernfs_node *kernfs_create_empty_dir(struct kernfs_node *parent, const char *name) { struct kernfs_node *kn; int rc; /* allocate */ kn = kernfs_new_node(parent, name, S_IRUGO|S_IXUGO|S_IFDIR, GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, KERNFS_DIR); if (!kn) return ERR_PTR(-ENOMEM); kn->flags |= KERNFS_EMPTY_DIR; kn->dir.root = parent->dir.root; kn->ns = NULL; kn->priv = NULL; /* link in */ rc = kernfs_add_one(kn); if (!rc) return kn; kernfs_put(kn); return ERR_PTR(rc); } static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags) { struct kernfs_node *kn; if (flags & LOOKUP_RCU) return -ECHILD; /* Negative hashed dentry? */ if (d_really_is_negative(dentry)) { struct kernfs_node *parent; /* If the kernfs parent node has changed discard and * proceed to ->lookup. */ down_read(&kernfs_rwsem); spin_lock(&dentry->d_lock); parent = kernfs_dentry_node(dentry->d_parent); if (parent) { if (kernfs_dir_changed(parent, dentry)) { spin_unlock(&dentry->d_lock); up_read(&kernfs_rwsem); return 0; } } spin_unlock(&dentry->d_lock); up_read(&kernfs_rwsem); /* The kernfs parent node hasn't changed, leave the * dentry negative and return success. */ return 1; } kn = kernfs_dentry_node(dentry); down_read(&kernfs_rwsem); /* The kernfs node has been deactivated */ if (!kernfs_active(kn)) goto out_bad; /* The kernfs node has been moved? */ if (kernfs_dentry_node(dentry->d_parent) != kn->parent) goto out_bad; /* The kernfs node has been renamed */ if (strcmp(dentry->d_name.name, kn->name) != 0) goto out_bad; /* The kernfs node has been moved to a different namespace */ if (kn->parent && kernfs_ns_enabled(kn->parent) && kernfs_info(dentry->d_sb)->ns != kn->ns) goto out_bad; up_read(&kernfs_rwsem); return 1; out_bad: up_read(&kernfs_rwsem); return 0; } const struct dentry_operations kernfs_dops = { .d_revalidate = kernfs_dop_revalidate, }; static struct dentry *kernfs_iop_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct kernfs_node *parent = dir->i_private; struct kernfs_node *kn; struct inode *inode = NULL; const void *ns = NULL; down_read(&kernfs_rwsem); if (kernfs_ns_enabled(parent)) ns = kernfs_info(dir->i_sb)->ns; kn = kernfs_find_ns(parent, dentry->d_name.name, ns); /* attach dentry and inode */ if (kn) { /* Inactive nodes are invisible to the VFS so don't * create a negative. */ if (!kernfs_active(kn)) { up_read(&kernfs_rwsem); return NULL; } inode = kernfs_get_inode(dir->i_sb, kn); if (!inode) inode = ERR_PTR(-ENOMEM); } /* * Needed for negative dentry validation. * The negative dentry can be created in kernfs_iop_lookup() * or transforms from positive dentry in dentry_unlink_inode() * called from vfs_rmdir(). 
*/ if (!IS_ERR(inode)) kernfs_set_rev(parent, dentry); up_read(&kernfs_rwsem); /* instantiate and hash (possibly negative) dentry */ return d_splice_alias(inode, dentry); } static int kernfs_iop_mkdir(struct user_namespace *mnt_userns, struct inode *dir, struct dentry *dentry, umode_t mode) { struct kernfs_node *parent = dir->i_private; struct kernfs_syscall_ops *scops = kernfs_root(parent)->syscall_ops; int ret; if (!scops || !scops->mkdir) return -EPERM; if (!kernfs_get_active(parent)) return -ENODEV; ret = scops->mkdir(parent, dentry->d_name.name, mode); kernfs_put_active(parent); return ret; } static int kernfs_iop_rmdir(struct inode *dir, struct dentry *dentry) { struct kernfs_node *kn = kernfs_dentry_node(dentry); struct kernfs_syscall_ops *scops = kernfs_root(kn)->syscall_ops; int ret; if (!scops || !scops->rmdir) return -EPERM; if (!kernfs_get_active(kn)) return -ENODEV; ret = scops->rmdir(kn); kernfs_put_active(kn); return ret; } static int kernfs_iop_rename(struct user_namespace *mnt_userns, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { struct kernfs_node *kn = kernfs_dentry_node(old_dentry); struct kernfs_node *new_parent = new_dir->i_private; struct kernfs_syscall_ops *scops = kernfs_root(kn)->syscall_ops; int ret; if (flags) return -EINVAL; if (!scops || !scops->rename) return -EPERM; if (!kernfs_get_active(kn)) return -ENODEV; if (!kernfs_get_active(new_parent)) { kernfs_put_active(kn); return -ENODEV; } ret = scops->rename(kn, new_parent, new_dentry->d_name.name); kernfs_put_active(new_parent); kernfs_put_active(kn); return ret; } const struct inode_operations kernfs_dir_iops = { .lookup = kernfs_iop_lookup, .permission = kernfs_iop_permission, .setattr = kernfs_iop_setattr, .getattr = kernfs_iop_getattr, .listxattr = kernfs_iop_listxattr, .mkdir = kernfs_iop_mkdir, .rmdir = kernfs_iop_rmdir, .rename = kernfs_iop_rename, }; static struct kernfs_node *kernfs_leftmost_descendant(struct kernfs_node *pos) { struct kernfs_node *last; while (true) { struct rb_node *rbn; last = pos; if (kernfs_type(pos) != KERNFS_DIR) break; rbn = rb_first(&pos->dir.children); if (!rbn) break; pos = rb_to_kn(rbn); } return last; } /** * kernfs_next_descendant_post - find the next descendant for post-order walk * @pos: the current position (%NULL to initiate traversal) * @root: kernfs_node whose descendants to walk * * Find the next descendant to visit for post-order traversal of @root's * descendants. @root is included in the iteration and the last node to be * visited. */ static struct kernfs_node *kernfs_next_descendant_post(struct kernfs_node *pos, struct kernfs_node *root) { struct rb_node *rbn; lockdep_assert_held_write(&kernfs_rwsem); /* if first iteration, visit leftmost descendant which may be root */ if (!pos) return kernfs_leftmost_descendant(root); /* if we visited @root, we're done */ if (pos == root) return NULL; /* if there's an unvisited sibling, visit its leftmost descendant */ rbn = rb_next(&pos->rb); if (rbn) return kernfs_leftmost_descendant(rb_to_kn(rbn)); /* no sibling left, visit parent */ return pos->parent; } /** * kernfs_activate - activate a node which started deactivated * @kn: kernfs_node whose subtree is to be activated * * If the root has KERNFS_ROOT_CREATE_DEACTIVATED set, a newly created node * needs to be explicitly activated. A node which hasn't been activated * isn't visible to userland and deactivation is skipped during its * removal. 
This is useful to construct atomic init sequences where * creation of multiple nodes should either succeed or fail atomically. * * The caller is responsible for ensuring that this function is not called * after kernfs_remove*() is invoked on @kn. */ void kernfs_activate(struct kernfs_node *kn) { struct kernfs_node *pos; down_write(&kernfs_rwsem); pos = NULL; while ((pos = kernfs_next_descendant_post(pos, kn))) { if (pos->flags & KERNFS_ACTIVATED) continue; WARN_ON_ONCE(pos->parent && RB_EMPTY_NODE(&pos->rb)); WARN_ON_ONCE(atomic_read(&pos->active) != KN_DEACTIVATED_BIAS); atomic_sub(KN_DEACTIVATED_BIAS, &pos->active); pos->flags |= KERNFS_ACTIVATED; } up_write(&kernfs_rwsem); } static void __kernfs_remove(struct kernfs_node *kn) { struct kernfs_node *pos; lockdep_assert_held_write(&kernfs_rwsem); /* * Short-circuit if non-root @kn has already finished removal. * This is for kernfs_remove_self() which plays with active ref * after removal. */ if (!kn || (kn->parent && RB_EMPTY_NODE(&kn->rb))) return; pr_debug("kernfs %s: removing\n", kn->name); /* prevent any new usage under @kn by deactivating all nodes */ pos = NULL; while ((pos = kernfs_next_descendant_post(pos, kn))) if (kernfs_active(pos)) atomic_add(KN_DEACTIVATED_BIAS, &pos->active); /* deactivate and unlink the subtree node-by-node */ do { pos = kernfs_leftmost_descendant(kn); /* * kernfs_drain() drops kernfs_rwsem temporarily and @pos's * base ref could have been put by someone else by the time * the function returns. Make sure it doesn't go away * underneath us. */ kernfs_get(pos); /* * Drain iff @kn was activated. This avoids draining and * its lockdep annotations for nodes which have never been * activated and allows embedding kernfs_remove() in create * error paths without worrying about draining. */ if (kn->flags & KERNFS_ACTIVATED) kernfs_drain(pos); else WARN_ON_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS); /* * kernfs_unlink_sibling() succeeds once per node. Use it * to decide who's responsible for cleanups. */ if (!pos->parent || kernfs_unlink_sibling(pos)) { struct kernfs_iattrs *ps_iattr = pos->parent ? pos->parent->iattr : NULL; /* update timestamps on the parent */ if (ps_iattr) { ktime_get_real_ts64(&ps_iattr->ia_ctime); ps_iattr->ia_mtime = ps_iattr->ia_ctime; } kernfs_put(pos); } kernfs_put(pos); } while (pos != kn); } /** * kernfs_remove - remove a kernfs_node recursively * @kn: the kernfs_node to remove * * Remove @kn along with all its subdirectories and files. */ void kernfs_remove(struct kernfs_node *kn) { down_write(&kernfs_rwsem); __kernfs_remove(kn); up_write(&kernfs_rwsem); } /** * kernfs_break_active_protection - break out of active protection * @kn: the self kernfs_node * * The caller must be running off of a kernfs operation which is invoked * with an active reference - e.g. one of kernfs_ops. Each invocation of * this function must also be matched with an invocation of * kernfs_unbreak_active_protection(). * * This function releases the active reference of @kn the caller is * holding. Once this function is called, @kn may be removed at any point * and the caller is solely responsible for ensuring that the objects it * dereferences are accessible. */ void kernfs_break_active_protection(struct kernfs_node *kn) { /* * Take out ourself out of the active ref dependency chain. If * we're called without an active ref, lockdep will complain. 
*/ kernfs_put_active(kn); } /** * kernfs_unbreak_active_protection - undo kernfs_break_active_protection() * @kn: the self kernfs_node * * If kernfs_break_active_protection() was called, this function must be * invoked before finishing the kernfs operation. Note that while this * function restores the active reference, it doesn't and can't actually * restore the active protection - @kn may already or be in the process of * being removed. Once kernfs_break_active_protection() is invoked, that * protection is irreversibly gone for the kernfs operation instance. * * While this function may be called at any point after * kernfs_break_active_protection() is invoked, its most useful location * would be right before the enclosing kernfs operation returns. */ void kernfs_unbreak_active_protection(struct kernfs_node *kn) { /* * @kn->active could be in any state; however, the increment we do * here will be undone as soon as the enclosing kernfs operation * finishes and this temporary bump can't break anything. If @kn * is alive, nothing changes. If @kn is being deactivated, the * soon-to-follow put will either finish deactivation or restore * deactivated state. If @kn is already removed, the temporary * bump is guaranteed to be gone before @kn is released. */ atomic_inc(&kn->active); if (kernfs_lockdep(kn)) rwsem_acquire(&kn->dep_map, 0, 1, _RET_IP_); } /** * kernfs_remove_self - remove a kernfs_node from its own method * @kn: the self kernfs_node to remove * * The caller must be running off of a kernfs operation which is invoked * with an active reference - e.g. one of kernfs_ops. This can be used to * implement a file operation which deletes itself. * * For example, the "delete" file for a sysfs device directory can be * implemented by invoking kernfs_remove_self() on the "delete" file * itself. This function breaks the circular dependency of trying to * deactivate self while holding an active ref itself. It isn't necessary * to modify the usual removal path to use kernfs_remove_self(). The * "delete" implementation can simply invoke kernfs_remove_self() on self * before proceeding with the usual removal path. kernfs will ignore later * kernfs_remove() on self. * * kernfs_remove_self() can be called multiple times concurrently on the * same kernfs_node. Only the first one actually performs removal and * returns %true. All others will wait until the kernfs operation which * won self-removal finishes and return %false. Note that the losers wait * for the completion of not only the winning kernfs_remove_self() but also * the whole kernfs_ops which won the arbitration. This can be used to * guarantee, for example, all concurrent writes to a "delete" file to * finish only after the whole operation is complete. */ bool kernfs_remove_self(struct kernfs_node *kn) { bool ret; down_write(&kernfs_rwsem); kernfs_break_active_protection(kn); /* * SUICIDAL is used to arbitrate among competing invocations. Only * the first one will actually perform removal. When the removal * is complete, SUICIDED is set and the active ref is restored * while kernfs_rwsem for held exclusive. The ones which lost * arbitration waits for SUICIDED && drained which can happen only * after the enclosing kernfs operation which executed the winning * instance of kernfs_remove_self() finished. 
*/ if (!(kn->flags & KERNFS_SUICIDAL)) { kn->flags |= KERNFS_SUICIDAL; __kernfs_remove(kn); kn->flags |= KERNFS_SUICIDED; ret = true; } else { wait_queue_head_t *waitq = &kernfs_root(kn)->deactivate_waitq; DEFINE_WAIT(wait); while (true) { prepare_to_wait(waitq, &wait, TASK_UNINTERRUPTIBLE); if ((kn->flags & KERNFS_SUICIDED) && atomic_read(&kn->active) == KN_DEACTIVATED_BIAS) break; up_write(&kernfs_rwsem); schedule(); down_write(&kernfs_rwsem); } finish_wait(waitq, &wait); WARN_ON_ONCE(!RB_EMPTY_NODE(&kn->rb)); ret = false; } /* * This must be done while kernfs_rwsem held exclusive; otherwise, * waiting for SUICIDED && deactivated could finish prematurely. */ kernfs_unbreak_active_protection(kn); up_write(&kernfs_rwsem); return ret; } /** * kernfs_remove_by_name_ns - find a kernfs_node by name and remove it * @parent: parent of the target * @name: name of the kernfs_node to remove * @ns: namespace tag of the kernfs_node to remove * * Look for the kernfs_node with @name and @ns under @parent and remove it. * Returns 0 on success, -ENOENT if such entry doesn't exist. */ int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name, const void *ns) { struct kernfs_node *kn; if (!parent) { WARN(1, KERN_WARNING "kernfs: can not remove '%s', no directory\n", name); return -ENOENT; } down_write(&kernfs_rwsem); kn = kernfs_find_ns(parent, name, ns); if (kn) { kernfs_get(kn); __kernfs_remove(kn); kernfs_put(kn); } up_write(&kernfs_rwsem); if (kn) return 0; else return -ENOENT; } /** * kernfs_rename_ns - move and rename a kernfs_node * @kn: target node * @new_parent: new parent to put @sd under * @new_name: new name * @new_ns: new namespace tag */ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, const char *new_name, const void *new_ns) { struct kernfs_node *old_parent; const char *old_name = NULL; int error; /* can't move or rename root */ if (!kn->parent) return -EINVAL; down_write(&kernfs_rwsem); error = -ENOENT; if (!kernfs_active(kn) || !kernfs_active(new_parent) || (new_parent->flags & KERNFS_EMPTY_DIR)) goto out; error = 0; if ((kn->parent == new_parent) && (kn->ns == new_ns) && (strcmp(kn->name, new_name) == 0)) goto out; /* nothing to rename */ error = -EEXIST; if (kernfs_find_ns(new_parent, new_name, new_ns)) goto out; /* rename kernfs_node */ if (strcmp(kn->name, new_name) != 0) { error = -ENOMEM; new_name = kstrdup_const(new_name, GFP_KERNEL); if (!new_name) goto out; } else { new_name = NULL; } /* * Move to the appropriate place in the appropriate directories rbtree. 
*/ kernfs_unlink_sibling(kn); kernfs_get(new_parent); /* rename_lock protects ->parent and ->name accessors */ spin_lock_irq(&kernfs_rename_lock); old_parent = kn->parent; kn->parent = new_parent; kn->ns = new_ns; if (new_name) { old_name = kn->name; kn->name = new_name; } spin_unlock_irq(&kernfs_rename_lock); kn->hash = kernfs_name_hash(kn->name, kn->ns); kernfs_link_sibling(kn); kernfs_put(old_parent); kfree_const(old_name); error = 0; out: up_write(&kernfs_rwsem); return error; } /* Relationship between mode and the DT_xxx types */ static inline unsigned char dt_type(struct kernfs_node *kn) { return (kn->mode >> 12) & 15; } static int kernfs_dir_fop_release(struct inode *inode, struct file *filp) { kernfs_put(filp->private_data); return 0; } static struct kernfs_node *kernfs_dir_pos(const void *ns, struct kernfs_node *parent, loff_t hash, struct kernfs_node *pos) { if (pos) { int valid = kernfs_active(pos) && pos->parent == parent && hash == pos->hash; kernfs_put(pos); if (!valid) pos = NULL; } if (!pos && (hash > 1) && (hash < INT_MAX)) { struct rb_node *node = parent->dir.children.rb_node; while (node) { pos = rb_to_kn(node); if (hash < pos->hash) node = node->rb_left; else if (hash > pos->hash) node = node->rb_right; else break; } } /* Skip over entries which are dying/dead or in the wrong namespace */ while (pos && (!kernfs_active(pos) || pos->ns != ns)) { struct rb_node *node = rb_next(&pos->rb); if (!node) pos = NULL; else pos = rb_to_kn(node); } return pos; } static struct kernfs_node *kernfs_dir_next_pos(const void *ns, struct kernfs_node *parent, ino_t ino, struct kernfs_node *pos) { pos = kernfs_dir_pos(ns, parent, ino, pos); if (pos) { do { struct rb_node *node = rb_next(&pos->rb); if (!node) pos = NULL; else pos = rb_to_kn(node); } while (pos && (!kernfs_active(pos) || pos->ns != ns)); } return pos; } static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx) { struct dentry *dentry = file->f_path.dentry; struct kernfs_node *parent = kernfs_dentry_node(dentry); struct kernfs_node *pos = file->private_data; const void *ns = NULL; if (!dir_emit_dots(file, ctx)) return 0; down_read(&kernfs_rwsem); if (kernfs_ns_enabled(parent)) ns = kernfs_info(dentry->d_sb)->ns; for (pos = kernfs_dir_pos(ns, parent, ctx->pos, pos); pos; pos = kernfs_dir_next_pos(ns, parent, ctx->pos, pos)) { const char *name = pos->name; unsigned int type = dt_type(pos); int len = strlen(name); ino_t ino = kernfs_ino(pos); ctx->pos = pos->hash; file->private_data = pos; kernfs_get(pos); up_read(&kernfs_rwsem); if (!dir_emit(ctx, name, len, ino, type)) return 0; down_read(&kernfs_rwsem); } up_read(&kernfs_rwsem); file->private_data = NULL; ctx->pos = INT_MAX; return 0; } const struct file_operations kernfs_dir_fops = { .read = generic_read_dir, .iterate_shared = kernfs_fop_readdir, .release = kernfs_dir_fop_release, .llseek = generic_file_llseek, };
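kernfs keeps each directory's children in an rbtree ordered by (hash, ns, name); kernfs_link_sibling() and kernfs_find_ns() walk that tree with the same comparison, and kernfs_fop_readdir() reuses the hash as the directory position. Below is a stand-alone sketch of that ordering rule using hypothetical stand-ins (struct example_node, example_name_compare) for the kernel types; it assumes the same precedence as kernfs_name_compare() and is not kernel code.

#include <stdio.h>
#include <string.h>

/* Hypothetical stand-in for struct kernfs_node: only the fields the
 * comparison looks at. */
struct example_node {
	unsigned int hash;  /* kernfs_name_hash(name, ns) in the kernel */
	const void *ns;     /* namespace tag */
	const char *name;
};

/* Same precedence as kernfs_name_compare(): hash first, then the ns
 * tag, then the name itself. Negative/zero/positive like strcmp(). */
static int example_name_compare(unsigned int hash, const char *name,
				const void *ns,
				const struct example_node *kn)
{
	if (hash != kn->hash)
		return hash < kn->hash ? -1 : 1;
	if (ns != kn->ns)
		return ns < kn->ns ? -1 : 1;
	return strcmp(name, kn->name);
}

int main(void)
{
	struct example_node existing = {
		.hash = 42, .ns = NULL, .name = "cpu",
	};

	/* A fully equal (hash, ns, name) triple is the "already linked"
	 * case (-EEXIST in kernfs_link_sibling()); equal hashes with
	 * different names (a collision) fall through to strcmp(). */
	printf("%d\n", example_name_compare(42, "cpu", NULL, &existing));     /* 0  */
	printf("%d\n", example_name_compare(42, "cpuacct", NULL, &existing)); /* >0 */
	printf("%d\n", example_name_compare(7, "cpu", NULL, &existing));      /* <0 */
	return 0;
}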
905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_PAGEMAP_H #define _LINUX_PAGEMAP_H /* * Copyright 1995 Linus Torvalds */ #include <linux/mm.h> #include <linux/fs.h> #include <linux/list.h> #include <linux/highmem.h> #include <linux/compiler.h> #include <linux/uaccess.h> #include <linux/gfp.h> #include <linux/bitops.h> #include <linux/hardirq.h> /* for in_interrupt() */ #include <linux/hugetlb_inline.h> struct pagevec; static inline bool mapping_empty(struct address_space *mapping) { return xa_empty(&mapping->i_pages); } /* * Bits in mapping->flags. */ enum mapping_flags { AS_EIO = 0, /* IO error on async write */ AS_ENOSPC = 1, /* ENOSPC on async write */ AS_MM_ALL_LOCKS = 2, /* under mm_take_all_locks() */ AS_UNEVICTABLE = 3, /* e.g., ramdisk, SHM_LOCK */ AS_EXITING = 4, /* final truncate in progress */ /* writeback related tags are not used */ AS_NO_WRITEBACK_TAGS = 5, AS_THP_SUPPORT = 6, /* THPs supported */ }; /** * mapping_set_error - record a writeback error in the address_space * @mapping: the mapping in which an error should be set * @error: the error to set in the mapping * * When writeback fails in some way, we must record that error so that * userspace can be informed when fsync and the like are called. We endeavor * to report errors on any file that was open at the time of the error. Some * internal callers also need to know when writeback errors have occurred. * * When a writeback error occurs, most filesystems will want to call * mapping_set_error to record the error in the mapping so that it can be * reported when the application calls fsync(2). 
*/ static inline void mapping_set_error(struct address_space *mapping, int error) { if (likely(!error)) return; /* Record in wb_err for checkers using errseq_t based tracking */ __filemap_set_wb_err(mapping, error); /* Record it in superblock */ if (mapping->host) errseq_set(&mapping->host->i_sb->s_wb_err, error); /* Record it in flags for now, for legacy callers */ if (error == -ENOSPC) set_bit(AS_ENOSPC, &mapping->flags); else set_bit(AS_EIO, &mapping->flags); } static inline void mapping_set_unevictable(struct address_space *mapping) { set_bit(AS_UNEVICTABLE, &mapping->flags); } static inline void mapping_clear_unevictable(struct address_space *mapping) { clear_bit(AS_UNEVICTABLE, &mapping->flags); } static inline bool mapping_unevictable(struct address_space *mapping) { return mapping && test_bit(AS_UNEVICTABLE, &mapping->flags); } static inline void mapping_set_exiting(struct address_space *mapping) { set_bit(AS_EXITING, &mapping->flags); } static inline int mapping_exiting(struct address_space *mapping) { return test_bit(AS_EXITING, &mapping->flags); } static inline void mapping_set_no_writeback_tags(struct address_space *mapping) { set_bit(AS_NO_WRITEBACK_TAGS, &mapping->flags); } static inline int mapping_use_writeback_tags(struct address_space *mapping) { return !test_bit(AS_NO_WRITEBACK_TAGS, &mapping->flags); } static inline gfp_t mapping_gfp_mask(struct address_space * mapping) { return mapping->gfp_mask; } /* Restricts the given gfp_mask to what the mapping allows. */ static inline gfp_t mapping_gfp_constraint(struct address_space *mapping, gfp_t gfp_mask) { return mapping_gfp_mask(mapping) & gfp_mask; } /* * This is non-atomic. Only to be used before the mapping is activated. * Probably needs a barrier... */ static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask) { m->gfp_mask = mask; } static inline bool mapping_thp_support(struct address_space *mapping) { return test_bit(AS_THP_SUPPORT, &mapping->flags); } static inline int filemap_nr_thps(struct address_space *mapping) { #ifdef CONFIG_READ_ONLY_THP_FOR_FS return atomic_read(&mapping->nr_thps); #else return 0; #endif } static inline void filemap_nr_thps_inc(struct address_space *mapping) { #ifdef CONFIG_READ_ONLY_THP_FOR_FS if (!mapping_thp_support(mapping)) atomic_inc(&mapping->nr_thps); #else WARN_ON_ONCE(1); #endif } static inline void filemap_nr_thps_dec(struct address_space *mapping) { #ifdef CONFIG_READ_ONLY_THP_FOR_FS if (!mapping_thp_support(mapping)) atomic_dec(&mapping->nr_thps); #else WARN_ON_ONCE(1); #endif } void release_pages(struct page **pages, int nr); /* * For file cache pages, return the address_space, otherwise return NULL */ static inline struct address_space *page_mapping_file(struct page *page) { if (unlikely(PageSwapCache(page))) return NULL; return page_mapping(page); } /* * speculatively take a reference to a page. * If the page is free (_refcount == 0), then _refcount is untouched, and 0 * is returned. Otherwise, _refcount is incremented by 1 and 1 is returned. * * This function must be called inside the same rcu_read_lock() section as has * been used to lookup the page in the pagecache radix-tree (or page table): * this allows allocators to use a synchronize_rcu() to stabilize _refcount. * * Unless an RCU grace period has passed, the count of all pages coming out * of the allocator must be considered unstable. 
page_count may return higher * than expected, and put_page must be able to do the right thing when the * page has been finished with, no matter what it is subsequently allocated * for (because put_page is what is used here to drop an invalid speculative * reference). * * This is the interesting part of the lockless pagecache (and lockless * get_user_pages) locking protocol, where the lookup-side (eg. find_get_page) * has the following pattern: * 1. find page in radix tree * 2. conditionally increment refcount * 3. check the page is still in pagecache (if no, goto 1) * * Remove-side that cares about stability of _refcount (eg. reclaim) has the * following (with the i_pages lock held): * A. atomically check refcount is correct and set it to 0 (atomic_cmpxchg) * B. remove page from pagecache * C. free the page * * There are 2 critical interleavings that matter: * - 2 runs before A: in this case, A sees elevated refcount and bails out * - A runs before 2: in this case, 2 sees zero refcount and retries; * subsequently, B will complete and 1 will find no page, causing the * lookup to return NULL. * * It is possible that between 1 and 2, the page is removed then the exact same * page is inserted into the same position in pagecache. That's OK: the * old find_get_page using a lock could equally have run before or after * such a re-insertion, depending on order that locks are granted. * * Lookups racing against pagecache insertion isn't a big problem: either 1 * will find the page or it will not. Likewise, the old find_get_page could run * either before the insertion or afterwards, depending on timing. */ static inline int __page_cache_add_speculative(struct page *page, int count) { #ifdef CONFIG_TINY_RCU # ifdef CONFIG_PREEMPT_COUNT VM_BUG_ON(!in_atomic() && !irqs_disabled()); # endif /* * Preempt must be disabled here - we rely on rcu_read_lock doing * this for us. * * Pagecache won't be truncated from interrupt context, so if we have * found a page in the radix tree here, we have pinned its refcount by * disabling preempt, and hence no need for the "speculative get" that * SMP requires. */ VM_BUG_ON_PAGE(page_count(page) == 0, page); page_ref_add(page, count); #else if (unlikely(!page_ref_add_unless(page, count, 0))) { /* * Either the page has been freed, or will be freed. * In either case, retry here and the caller should * do the right thing (see comments above). */ return 0; } #endif VM_BUG_ON_PAGE(PageTail(page), page); return 1; } static inline int page_cache_get_speculative(struct page *page) { return __page_cache_add_speculative(page, 1); } static inline int page_cache_add_speculative(struct page *page, int count) { return __page_cache_add_speculative(page, count); } /** * attach_page_private - Attach private data to a page. * @page: Page to attach data to. * @data: Data to attach to page. * * Attaching private data to a page increments the page's reference count. * The data must be detached before the page will be freed. */ static inline void attach_page_private(struct page *page, void *data) { get_page(page); set_page_private(page, (unsigned long)data); SetPagePrivate(page); } /** * detach_page_private - Detach private data from a page. * @page: Page to detach data from. * * Removes the data that was previously attached to the page and decrements * the refcount on the page. * * Return: Data that was attached to the page. 
*/ static inline void *detach_page_private(struct page *page) { void *data = (void *)page_private(page); if (!PagePrivate(page)) return NULL; ClearPagePrivate(page); set_page_private(page, 0); put_page(page); return data; } #ifdef CONFIG_NUMA extern struct page *__page_cache_alloc(gfp_t gfp); #else static inline struct page *__page_cache_alloc(gfp_t gfp) { return alloc_pages(gfp, 0); } #endif static inline struct page *page_cache_alloc(struct address_space *x) { return __page_cache_alloc(mapping_gfp_mask(x)); } static inline gfp_t readahead_gfp_mask(struct address_space *x) { return mapping_gfp_mask(x) | __GFP_NORETRY | __GFP_NOWARN; } typedef int filler_t(void *, struct page *); pgoff_t page_cache_next_miss(struct address_space *mapping, pgoff_t index, unsigned long max_scan); pgoff_t page_cache_prev_miss(struct address_space *mapping, pgoff_t index, unsigned long max_scan); #define FGP_ACCESSED 0x00000001 #define FGP_LOCK 0x00000002 #define FGP_CREAT 0x00000004 #define FGP_WRITE 0x00000008 #define FGP_NOFS 0x00000010 #define FGP_NOWAIT 0x00000020 #define FGP_FOR_MMAP 0x00000040 #define FGP_HEAD 0x00000080 #define FGP_ENTRY 0x00000100 struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset, int fgp_flags, gfp_t cache_gfp_mask); /** * find_get_page - find and get a page reference * @mapping: the address_space to search * @offset: the page index * * Looks up the page cache slot at @mapping & @offset. If there is a * page cache page, it is returned with an increased refcount. * * Otherwise, %NULL is returned. */ static inline struct page *find_get_page(struct address_space *mapping, pgoff_t offset) { return pagecache_get_page(mapping, offset, 0, 0); } static inline struct page *find_get_page_flags(struct address_space *mapping, pgoff_t offset, int fgp_flags) { return pagecache_get_page(mapping, offset, fgp_flags, 0); } /** * find_lock_page - locate, pin and lock a pagecache page * @mapping: the address_space to search * @index: the page index * * Looks up the page cache entry at @mapping & @index. If there is a * page cache page, it is returned locked and with an increased * refcount. * * Context: May sleep. * Return: A struct page or %NULL if there is no page in the cache for this * index. */ static inline struct page *find_lock_page(struct address_space *mapping, pgoff_t index) { return pagecache_get_page(mapping, index, FGP_LOCK, 0); } /** * find_lock_head - Locate, pin and lock a pagecache page. * @mapping: The address_space to search. * @index: The page index. * * Looks up the page cache entry at @mapping & @index. If there is a * page cache page, its head page is returned locked and with an increased * refcount. * * Context: May sleep. * Return: A struct page which is !PageTail, or %NULL if there is no page * in the cache for this index. */ static inline struct page *find_lock_head(struct address_space *mapping, pgoff_t index) { return pagecache_get_page(mapping, index, FGP_LOCK | FGP_HEAD, 0); } /** * find_or_create_page - locate or add a pagecache page * @mapping: the page's address_space * @index: the page's index into the mapping * @gfp_mask: page allocation mode * * Looks up the page cache slot at @mapping & @offset. If there is a * page cache page, it is returned locked and with an increased * refcount. * * If the page is not present, a new page is allocated using @gfp_mask * and added to the page cache and the VM's LRU list. The page is * returned locked and with an increased refcount. * * On memory exhaustion, %NULL is returned. 
* * find_or_create_page() may sleep, even if @gfp_flags specifies an * atomic allocation! */ static inline struct page *find_or_create_page(struct address_space *mapping, pgoff_t index, gfp_t gfp_mask) { return pagecache_get_page(mapping, index, FGP_LOCK|FGP_ACCESSED|FGP_CREAT, gfp_mask); } /** * grab_cache_page_nowait - returns locked page at given index in given cache * @mapping: target address_space * @index: the page index * * Same as grab_cache_page(), but do not wait if the page is unavailable. * This is intended for speculative data generators, where the data can * be regenerated if the page couldn't be grabbed. This routine should * be safe to call while holding the lock for another page. * * Clear __GFP_FS when allocating the page to avoid recursion into the fs * and deadlock against the caller's locked page. */ static inline struct page *grab_cache_page_nowait(struct address_space *mapping, pgoff_t index) { return pagecache_get_page(mapping, index, FGP_LOCK|FGP_CREAT|FGP_NOFS|FGP_NOWAIT, mapping_gfp_mask(mapping)); } /* Does this page contain this index? */ static inline bool thp_contains(struct page *head, pgoff_t index) { /* HugeTLBfs indexes the page cache in units of hpage_size */ if (PageHuge(head)) return head->index == index; return page_index(head) == (index & ~(thp_nr_pages(head) - 1UL)); } /* * Given the page we found in the page cache, return the page corresponding * to this index in the file */ static inline struct page *find_subpage(struct page *head, pgoff_t index) { /* HugeTLBfs wants the head page regardless */ if (PageHuge(head)) return head; return head + (index & (thp_nr_pages(head) - 1)); } unsigned find_get_entries(struct address_space *mapping, pgoff_t start, pgoff_t end, struct pagevec *pvec, pgoff_t *indices); unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start, pgoff_t end, unsigned int nr_pages, struct page **pages); static inline unsigned find_get_pages(struct address_space *mapping, pgoff_t *start, unsigned int nr_pages, struct page **pages) { return find_get_pages_range(mapping, start, (pgoff_t)-1, nr_pages, pages); } unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t start, unsigned int nr_pages, struct page **pages); unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, pgoff_t end, xa_mark_t tag, unsigned int nr_pages, struct page **pages); static inline unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, xa_mark_t tag, unsigned int nr_pages, struct page **pages) { return find_get_pages_range_tag(mapping, index, (pgoff_t)-1, tag, nr_pages, pages); } struct page *grab_cache_page_write_begin(struct address_space *mapping, pgoff_t index, unsigned flags); /* * Returns locked page at given index in given cache, creating it if needed. */ static inline struct page *grab_cache_page(struct address_space *mapping, pgoff_t index) { return find_or_create_page(mapping, index, mapping_gfp_mask(mapping)); } extern struct page * read_cache_page(struct address_space *mapping, pgoff_t index, filler_t *filler, void *data); extern struct page * read_cache_page_gfp(struct address_space *mapping, pgoff_t index, gfp_t gfp_mask); extern int read_cache_pages(struct address_space *mapping, struct list_head *pages, filler_t *filler, void *data); static inline struct page *read_mapping_page(struct address_space *mapping, pgoff_t index, void *data) { return read_cache_page(mapping, index, NULL, data); } /* * Get index of the page within radix-tree (but not for hugetlb pages). 
* (TODO: remove once hugetlb pages will have ->index in PAGE_SIZE) */ static inline pgoff_t page_to_index(struct page *page) { struct page *head; if (likely(!PageTransTail(page))) return page->index; head = compound_head(page); /* * We don't initialize ->index for tail pages: calculate based on * head page */ return head->index + page - head; } extern pgoff_t hugetlb_basepage_index(struct page *page); /* * Get the offset in PAGE_SIZE (even for hugetlb pages). * (TODO: hugetlb pages should have ->index in PAGE_SIZE) */ static inline pgoff_t page_to_pgoff(struct page *page) { if (unlikely(PageHuge(page))) return hugetlb_basepage_index(page); return page_to_index(page); } /* * Return byte-offset into filesystem object for page. */ static inline loff_t page_offset(struct page *page) { return ((loff_t)page->index) << PAGE_SHIFT; } static inline loff_t page_file_offset(struct page *page) { return ((loff_t)page_index(page)) << PAGE_SHIFT; } extern pgoff_t linear_hugepage_index(struct vm_area_struct *vma, unsigned long address); static inline pgoff_t linear_page_index(struct vm_area_struct *vma, unsigned long address) { pgoff_t pgoff; if (unlikely(is_vm_hugetlb_page(vma))) return linear_hugepage_index(vma, address); pgoff = (address - vma->vm_start) >> PAGE_SHIFT; pgoff += vma->vm_pgoff; return pgoff; } struct wait_page_key { struct page *page; int bit_nr; int page_match; }; struct wait_page_queue { struct page *page; int bit_nr; wait_queue_entry_t wait; }; static inline bool wake_page_match(struct wait_page_queue *wait_page, struct wait_page_key *key) { if (wait_page->page != key->page) return false; key->page_match = 1; if (wait_page->bit_nr != key->bit_nr) return false; return true; } extern void __lock_page(struct page *page); extern int __lock_page_killable(struct page *page); extern int __lock_page_async(struct page *page, struct wait_page_queue *wait); extern int __lock_page_or_retry(struct page *page, struct mm_struct *mm, unsigned int flags); extern void unlock_page(struct page *page); /* * Return true if the page was successfully locked */ static inline int trylock_page(struct page *page) { page = compound_head(page); return (likely(!test_and_set_bit_lock(PG_locked, &page->flags))); } /* * lock_page may only be called if we have the page's inode pinned. */ static inline void lock_page(struct page *page) { might_sleep(); if (!trylock_page(page)) __lock_page(page); } /* * lock_page_killable is like lock_page but can be interrupted by fatal * signals. It returns 0 if it locked the page and -EINTR if it was * killed while waiting. */ static inline int lock_page_killable(struct page *page) { might_sleep(); if (!trylock_page(page)) return __lock_page_killable(page); return 0; } /* * lock_page_async - Lock the page, unless this would block. If the page * is already locked, then queue a callback when the page becomes unlocked. * This callback can then retry the operation. * * Returns 0 if the page is locked successfully, or -EIOCBQUEUED if the page * was already locked and the callback defined in 'wait' was queued. */ static inline int lock_page_async(struct page *page, struct wait_page_queue *wait) { if (!trylock_page(page)) return __lock_page_async(page, wait); return 0; } /* * lock_page_or_retry - Lock the page, unless this would block and the * caller indicated that it can handle a retry. * * Return value and mmap_lock implications depend on flags; see * __lock_page_or_retry(). 
*/ static inline int lock_page_or_retry(struct page *page, struct mm_struct *mm, unsigned int flags) { might_sleep(); return trylock_page(page) || __lock_page_or_retry(page, mm, flags); } /* * This is exported only for wait_on_page_locked/wait_on_page_writeback, etc., * and should not be used directly. */ extern void wait_on_page_bit(struct page *page, int bit_nr); extern int wait_on_page_bit_killable(struct page *page, int bit_nr); /* * Wait for a page to be unlocked. * * This must be called with the caller "holding" the page, * ie with increased "page->count" so that the page won't * go away during the wait.. */ static inline void wait_on_page_locked(struct page *page) { if (PageLocked(page)) wait_on_page_bit(compound_head(page), PG_locked); } static inline int wait_on_page_locked_killable(struct page *page) { if (!PageLocked(page)) return 0; return wait_on_page_bit_killable(compound_head(page), PG_locked); } int put_and_wait_on_page_locked(struct page *page, int state); void wait_on_page_writeback(struct page *page); int wait_on_page_writeback_killable(struct page *page); extern void end_page_writeback(struct page *page); void wait_for_stable_page(struct page *page); void __set_page_dirty(struct page *, struct address_space *, int warn); int __set_page_dirty_nobuffers(struct page *page); int __set_page_dirty_no_writeback(struct page *page); void page_endio(struct page *page, bool is_write, int err); /** * set_page_private_2 - Set PG_private_2 on a page and take a ref * @page: The page. * * Set the PG_private_2 flag on a page and take the reference needed for the VM * to handle its lifetime correctly. This sets the flag and takes the * reference unconditionally, so care must be taken not to set the flag again * if it's already set. */ static inline void set_page_private_2(struct page *page) { page = compound_head(page); get_page(page); SetPagePrivate2(page); } void end_page_private_2(struct page *page); void wait_on_page_private_2(struct page *page); int wait_on_page_private_2_killable(struct page *page); /* * Add an arbitrary waiter to a page's wait queue */ extern void add_page_wait_queue(struct page *page, wait_queue_entry_t *waiter); /* * Fault in userspace address range. */ size_t fault_in_writeable(char __user *uaddr, size_t size); size_t fault_in_safe_writeable(const char __user *uaddr, size_t size); size_t fault_in_readable(const char __user *uaddr, size_t size); int add_to_page_cache_locked(struct page *page, struct address_space *mapping, pgoff_t index, gfp_t gfp_mask); int add_to_page_cache_lru(struct page *page, struct address_space *mapping, pgoff_t index, gfp_t gfp_mask); extern void delete_from_page_cache(struct page *page); extern void __delete_from_page_cache(struct page *page, void *shadow); void replace_page_cache_page(struct page *old, struct page *new); void delete_from_page_cache_batch(struct address_space *mapping, struct pagevec *pvec); loff_t mapping_seek_hole_data(struct address_space *, loff_t start, loff_t end, int whence); /* * Like add_to_page_cache_locked, but used to add newly allocated pages: * the page is new, so we can just run __SetPageLocked() against it. */ static inline int add_to_page_cache(struct page *page, struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask) { int error; __SetPageLocked(page); error = add_to_page_cache_locked(page, mapping, offset, gfp_mask); if (unlikely(error)) __ClearPageLocked(page); return error; } /** * struct readahead_control - Describes a readahead request. * * A readahead request is for consecutive pages. 
Filesystems which * implement the ->readahead method should call readahead_page() or * readahead_page_batch() in a loop and attempt to start I/O against * each page in the request. * * Most of the fields in this struct are private and should be accessed * by the functions below. * * @file: The file, used primarily by network filesystems for authentication. * May be NULL if invoked internally by the filesystem. * @mapping: Readahead this filesystem object. * @ra: File readahead state. May be NULL. */ struct readahead_control { struct file *file; struct address_space *mapping; struct file_ra_state *ra; /* private: use the readahead_* accessors instead */ pgoff_t _index; unsigned int _nr_pages; unsigned int _batch_count; }; #define DEFINE_READAHEAD(ractl, f, r, m, i) \ struct readahead_control ractl = { \ .file = f, \ .mapping = m, \ .ra = r, \ ._index = i, \ } #define VM_READAHEAD_PAGES (SZ_128K / PAGE_SIZE) void page_cache_ra_unbounded(struct readahead_control *, unsigned long nr_to_read, unsigned long lookahead_count); void page_cache_sync_ra(struct readahead_control *, unsigned long req_count); void page_cache_async_ra(struct readahead_control *, struct page *, unsigned long req_count); void readahead_expand(struct readahead_control *ractl, loff_t new_start, size_t new_len); /** * page_cache_sync_readahead - generic file readahead * @mapping: address_space which holds the pagecache and I/O vectors * @ra: file_ra_state which holds the readahead state * @file: Used by the filesystem for authentication. * @index: Index of first page to be read. * @req_count: Total number of pages being read by the caller. * * page_cache_sync_readahead() should be called when a cache miss happened: * it will submit the read. The readahead logic may decide to piggyback more * pages onto the read request if access patterns suggest it will improve * performance. */ static inline void page_cache_sync_readahead(struct address_space *mapping, struct file_ra_state *ra, struct file *file, pgoff_t index, unsigned long req_count) { DEFINE_READAHEAD(ractl, file, ra, mapping, index); page_cache_sync_ra(&ractl, req_count); } /** * page_cache_async_readahead - file readahead for marked pages * @mapping: address_space which holds the pagecache and I/O vectors * @ra: file_ra_state which holds the readahead state * @file: Used by the filesystem for authentication. * @page: The page at @index which triggered the readahead call. * @index: Index of first page to be read. * @req_count: Total number of pages being read by the caller. * * page_cache_async_readahead() should be called when a page is used which * is marked as PageReadahead; this is a marker to suggest that the application * has used up enough of the readahead window that we should start pulling in * more pages. */ static inline void page_cache_async_readahead(struct address_space *mapping, struct file_ra_state *ra, struct file *file, struct page *page, pgoff_t index, unsigned long req_count) { DEFINE_READAHEAD(ractl, file, ra, mapping, index); page_cache_async_ra(&ractl, page, req_count); } /** * readahead_page - Get the next page to read. * @rac: The current readahead request. * * Context: The page is locked and has an elevated refcount. The caller * should decreases the refcount once the page has been submitted for I/O * and unlock the page once all I/O to that page has completed. * Return: A pointer to the next page, or %NULL if we are done. 
*/ static inline struct page *readahead_page(struct readahead_control *rac) { struct page *page; BUG_ON(rac->_batch_count > rac->_nr_pages); rac->_nr_pages -= rac->_batch_count; rac->_index += rac->_batch_count; if (!rac->_nr_pages) { rac->_batch_count = 0; return NULL; } page = xa_load(&rac->mapping->i_pages, rac->_index); VM_BUG_ON_PAGE(!PageLocked(page), page); rac->_batch_count = thp_nr_pages(page); return page; } static inline unsigned int __readahead_batch(struct readahead_control *rac, struct page **array, unsigned int array_sz) { unsigned int i = 0; XA_STATE(xas, &rac->mapping->i_pages, 0); struct page *page; BUG_ON(rac->_batch_count > rac->_nr_pages); rac->_nr_pages -= rac->_batch_count; rac->_index += rac->_batch_count; rac->_batch_count = 0; xas_set(&xas, rac->_index); rcu_read_lock(); xas_for_each(&xas, page, rac->_index + rac->_nr_pages - 1) { if (xas_retry(&xas, page)) continue; VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(PageTail(page), page); array[i++] = page; rac->_batch_count += thp_nr_pages(page); /* * The page cache isn't using multi-index entries yet, * so the xas cursor needs to be manually moved to the * next index. This can be removed once the page cache * is converted. */ if (PageHead(page)) xas_set(&xas, rac->_index + rac->_batch_count); if (i == array_sz) break; } rcu_read_unlock(); return i; } /** * readahead_page_batch - Get a batch of pages to read. * @rac: The current readahead request. * @array: An array of pointers to struct page. * * Context: The pages are locked and have an elevated refcount. The caller * should decreases the refcount once the page has been submitted for I/O * and unlock the page once all I/O to that page has completed. * Return: The number of pages placed in the array. 0 indicates the request * is complete. */ #define readahead_page_batch(rac, array) \ __readahead_batch(rac, array, ARRAY_SIZE(array)) /** * readahead_pos - The byte offset into the file of this readahead request. * @rac: The readahead request. */ static inline loff_t readahead_pos(struct readahead_control *rac) { return (loff_t)rac->_index * PAGE_SIZE; } /** * readahead_length - The number of bytes in this readahead request. * @rac: The readahead request. */ static inline size_t readahead_length(struct readahead_control *rac) { return rac->_nr_pages * PAGE_SIZE; } /** * readahead_index - The index of the first page in this readahead request. * @rac: The readahead request. */ static inline pgoff_t readahead_index(struct readahead_control *rac) { return rac->_index; } /** * readahead_count - The number of pages in this readahead request. * @rac: The readahead request. */ static inline unsigned int readahead_count(struct readahead_control *rac) { return rac->_nr_pages; } /** * readahead_batch_length - The number of bytes in the current batch. * @rac: The readahead request. */ static inline size_t readahead_batch_length(struct readahead_control *rac) { return rac->_batch_count * PAGE_SIZE; } static inline unsigned long dir_pages(struct inode *inode) { return (unsigned long)(inode->i_size + PAGE_SIZE - 1) >> PAGE_SHIFT; } /** * page_mkwrite_check_truncate - check if page was truncated * @page: the page to check * @inode: the inode to check the page against * * Returns the number of bytes in the page up to EOF, * or -EFAULT if the page was truncated. 
*/ static inline int page_mkwrite_check_truncate(struct page *page, struct inode *inode) { loff_t size = i_size_read(inode); pgoff_t index = size >> PAGE_SHIFT; int offset = offset_in_page(size); if (page->mapping != inode->i_mapping) return -EFAULT; /* page is wholly inside EOF */ if (page->index < index) return PAGE_SIZE; /* page is wholly past EOF */ if (page->index > index || !offset) return -EFAULT; /* page is partially inside EOF */ return offset; } /** * i_blocks_per_page - How many blocks fit in this page. * @inode: The inode which contains the blocks. * @page: The page (head page if the page is a THP). * * If the block size is larger than the size of this page, return zero. * * Context: The caller should hold a refcount on the page to prevent it * from being split. * Return: The number of filesystem blocks covered by this page. */ static inline unsigned int i_blocks_per_page(struct inode *inode, struct page *page) { return thp_size(page) >> inode->i_blkbits; } #endif /* _LINUX_PAGEMAP_H */
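The pagemap helpers above combine into a common filesystem idiom: look up a page at a given index, creating and locking it if it is not yet cached, then drop the lock and reference when done. The sketch below is illustrative only and not part of pagemap.h; the function name example_get_locked_page() is hypothetical.

#include <linux/pagemap.h>

/*
 * Hypothetical helper: fetch the page at @index, allocating and locking it
 * if it is not already cached (FGP_LOCK | FGP_ACCESSED | FGP_CREAT via
 * find_or_create_page()).  On success the page is returned locked with an
 * elevated refcount; the caller must unlock_page() and put_page() it.
 */
static struct page *example_get_locked_page(struct address_space *mapping,
					    pgoff_t index)
{
	gfp_t gfp = mapping_gfp_constraint(mapping, GFP_KERNEL);

	return find_or_create_page(mapping, index, gfp);
}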
/* SPDX-License-Identifier: GPL-2.0 */ /* * Shared Memory Communications over RDMA (SMC-R) and RoCE * * Definitions for the SMC module (socket related) * * Copyright IBM Corp. 2016 * * Author(s): Ursula Braun <ubraun@linux.vnet.ibm.com> */ #ifndef __SMC_H #define __SMC_H #include <linux/socket.h> #include <linux/types.h> #include <linux/compiler.h> /* __aligned */ #include <net/sock.h> #include "smc_ib.h" #define SMC_V1 1 /* SMC version V1 */ #define SMC_V2 2 /* SMC version V2 */ #define SMC_RELEASE 0 #define SMCPROTO_SMC 0 /* SMC protocol, IPv4 */ #define SMCPROTO_SMC6 1 /* SMC protocol, IPv6 */ #define SMC_MAX_ISM_DEVS 8 /* max # of proposed non-native ISM * devices */ #define SMC_MAX_HOSTNAME_LEN 32 #define SMC_MAX_EID_LEN 32 extern struct proto smc_proto; extern struct proto smc_proto6; #ifdef ATOMIC64_INIT #define KERNEL_HAS_ATOMIC64 #endif enum smc_state { /* possible states of an SMC socket */ SMC_ACTIVE = 1, SMC_INIT = 2, SMC_CLOSED = 7, SMC_LISTEN = 10, /* normal close */ SMC_PEERCLOSEWAIT1 = 20, SMC_PEERCLOSEWAIT2 = 21, SMC_APPFINCLOSEWAIT = 24, SMC_APPCLOSEWAIT1 = 22, SMC_APPCLOSEWAIT2 = 23, SMC_PEERFINCLOSEWAIT = 25, /* abnormal close */ SMC_PEERABORTWAIT = 26, SMC_PROCESSABORT = 27, }; struct smc_link_group; struct smc_wr_rx_hdr { /* common prefix part of LLC and CDC to demultiplex */ u8 type; } __aligned(1); struct smc_cdc_conn_state_flags { #if defined(__BIG_ENDIAN_BITFIELD) u8 peer_done_writing : 1; /* Sending done indicator */ u8 peer_conn_closed : 1; /* Peer connection closed indicator */ u8 peer_conn_abort : 1; /* Abnormal close indicator */ u8 reserved : 5; #elif defined(__LITTLE_ENDIAN_BITFIELD) u8 reserved : 5; u8 peer_conn_abort : 1; u8 peer_conn_closed : 1; u8 peer_done_writing : 1; #endif }; struct smc_cdc_producer_flags { #if defined(__BIG_ENDIAN_BITFIELD) u8 write_blocked : 1; /* Writing Blocked, no rx buf space */ u8 urg_data_pending : 1; /* Urgent Data Pending */ u8 urg_data_present : 1; /* Urgent Data Present */ u8 cons_curs_upd_req : 1; /* cursor update requested */ u8 failover_validation : 1;/* message replay due to failover */ u8 reserved : 3; #elif defined(__LITTLE_ENDIAN_BITFIELD) u8 reserved : 3; u8 failover_validation : 1; u8 cons_curs_upd_req : 1; u8 urg_data_present : 1; u8 urg_data_pending : 1; u8 write_blocked : 1; #endif }; /* in host byte order */ union
smc_host_cursor { /* SMC cursor - an offset in an RMBE */ struct { u16 reserved; u16 wrap; /* window wrap sequence number */ u32 count; /* cursor (= offset) part */ }; #ifdef KERNEL_HAS_ATOMIC64 atomic64_t acurs; /* for atomic processing */ #else u64 acurs; /* for atomic processing */ #endif } __aligned(8); /* in host byte order, except for flag bitfields in network byte order */ struct smc_host_cdc_msg { /* Connection Data Control message */ struct smc_wr_rx_hdr common; /* .type = 0xFE */ u8 len; /* length = 44 */ u16 seqno; /* connection seq # */ u32 token; /* alert_token */ union smc_host_cursor prod; /* producer cursor */ union smc_host_cursor cons; /* consumer cursor, * piggy backed "ack" */ struct smc_cdc_producer_flags prod_flags; /* conn. tx/rx status */ struct smc_cdc_conn_state_flags conn_state_flags; /* peer conn. status*/ u8 reserved[18]; } __aligned(8); enum smc_urg_state { SMC_URG_VALID = 1, /* data present */ SMC_URG_NOTYET = 2, /* data pending */ SMC_URG_READ = 3, /* data was already read */ }; struct smc_mark_woken { bool woken; void *key; wait_queue_entry_t wait_entry; }; struct smc_connection { struct rb_node alert_node; struct smc_link_group *lgr; /* link group of connection */ struct smc_link *lnk; /* assigned SMC-R link */ u32 alert_token_local; /* unique conn. id */ u8 peer_rmbe_idx; /* from tcp handshake */ int peer_rmbe_size; /* size of peer rx buffer */ atomic_t peer_rmbe_space;/* remaining free bytes in peer * rmbe */ int rtoken_idx; /* idx to peer RMB rkey/addr */ struct smc_buf_desc *sndbuf_desc; /* send buffer descriptor */ struct smc_buf_desc *rmb_desc; /* RMBE descriptor */ int rmbe_size_short;/* compressed notation */ int rmbe_update_limit; /* lower limit for consumer * cursor update */ struct smc_host_cdc_msg local_tx_ctrl; /* host byte order staging * buffer for CDC msg send * .prod cf. TCP snd_nxt * .cons cf. TCP sends ack */ union smc_host_cursor local_tx_ctrl_fin; /* prod crsr - confirmed by peer */ union smc_host_cursor tx_curs_prep; /* tx - prepared data * snd_max..wmem_alloc */ union smc_host_cursor tx_curs_sent; /* tx - sent data * snd_nxt ? */ union smc_host_cursor tx_curs_fin; /* tx - confirmed by peer * snd-wnd-begin ? */ atomic_t sndbuf_space; /* remaining space in sndbuf */ u16 tx_cdc_seq; /* sequence # for CDC send */ u16 tx_cdc_seq_fin; /* sequence # - tx completed */ spinlock_t send_lock; /* protect wr_sends */ atomic_t cdc_pend_tx_wr; /* number of pending tx CDC wqe * - inc when post wqe, * - dec on polled tx cqe */ wait_queue_head_t cdc_pend_tx_wq; /* wakeup on no cdc_pend_tx_wr*/ struct delayed_work tx_work; /* retry of smc_cdc_msg_send */ u32 tx_off; /* base offset in peer rmb */ struct smc_host_cdc_msg local_rx_ctrl; /* filled during event_handl. * .prod cf. TCP rcv_nxt * .cons cf. TCP snd_una */ union smc_host_cursor rx_curs_confirmed; /* confirmed to peer * source of snd_una ? 
*/ union smc_host_cursor urg_curs; /* points at urgent byte */ enum smc_urg_state urg_state; bool urg_tx_pend; /* urgent data staged */ bool urg_rx_skip_pend; /* indicate urgent oob data * read, but previous regular * data still pending */ char urg_rx_byte; /* urgent byte */ atomic_t bytes_to_rcv; /* arrived data, * not yet received */ atomic_t splice_pending; /* number of spliced bytes * pending processing */ #ifndef KERNEL_HAS_ATOMIC64 spinlock_t acurs_lock; /* protect cursors */ #endif struct work_struct close_work; /* peer sent some closing */ struct work_struct abort_work; /* abort the connection */ struct tasklet_struct rx_tsklet; /* Receiver tasklet for SMC-D */ u8 rx_off; /* receive offset: * 0 for SMC-R, 32 for SMC-D */ u64 peer_token; /* SMC-D token of peer */ u8 killed : 1; /* abnormal termination */ u8 out_of_sync : 1; /* out of sync with peer */ }; struct smc_sock { /* smc sock container */ struct sock sk; struct socket *clcsock; /* internal tcp socket */ void (*clcsk_state_change)(struct sock *sk); /* original stat_change fct. */ void (*clcsk_data_ready)(struct sock *sk); /* original data_ready fct. */ void (*clcsk_write_space)(struct sock *sk); /* original write_space fct. */ void (*clcsk_error_report)(struct sock *sk); /* original error_report fct. */ struct smc_connection conn; /* smc connection */ struct smc_sock *listen_smc; /* listen parent */ struct work_struct connect_work; /* handle non-blocking connect*/ struct work_struct tcp_listen_work;/* handle tcp socket accepts */ struct work_struct smc_listen_work;/* prepare new accept socket */ struct list_head accept_q; /* sockets to be accepted */ spinlock_t accept_q_lock; /* protects accept_q */ bool use_fallback; /* fallback to tcp */ int fallback_rsn; /* reason for fallback */ u32 peer_diagnosis; /* decline reason from peer */ atomic_t queued_smc_hs; /* queued smc handshakes */ struct inet_connection_sock_af_ops af_ops; const struct inet_connection_sock_af_ops *ori_af_ops; /* original af ops */ int sockopt_defer_accept; /* sockopt TCP_DEFER_ACCEPT * value */ u8 wait_close_tx_prepared : 1; /* shutdown wr or close * started, waiting for unsent * data to be sent */ u8 connect_nonblock : 1; /* non-blocking connect in * flight */ struct mutex clcsock_release_lock; /* protects clcsock of a listen * socket * */ }; static inline struct smc_sock *smc_sk(const struct sock *sk) { return (struct smc_sock *)sk; } static inline struct smc_sock *smc_clcsock_user_data(const struct sock *clcsk) { return (struct smc_sock *) ((uintptr_t)clcsk->sk_user_data & ~SK_USER_DATA_NOCOPY); } extern struct workqueue_struct *smc_hs_wq; /* wq for handshake work */ extern struct workqueue_struct *smc_close_wq; /* wq for close work */ #define SMC_SYSTEMID_LEN 8 extern u8 local_systemid[SMC_SYSTEMID_LEN]; /* unique system identifier */ #define ntohll(x) be64_to_cpu(x) #define htonll(x) cpu_to_be64(x) /* convert an u32 value into network byte order, store it into a 3 byte field */ static inline void hton24(u8 *net, u32 host) { __be32 t; t = cpu_to_be32(host); memcpy(net, ((u8 *)&t) + 1, 3); } /* convert a received 3 byte field into host byte order*/ static inline u32 ntoh24(u8 *net) { __be32 t = 0; memcpy(((u8 *)&t) + 1, net, 3); return be32_to_cpu(t); } #ifdef CONFIG_XFRM static inline bool using_ipsec(struct smc_sock *smc) { return (smc->clcsock->sk->sk_policy[0] || smc->clcsock->sk->sk_policy[1]) ? 
true : false; } #else static inline bool using_ipsec(struct smc_sock *smc) { return false; } #endif struct sock *smc_accept_dequeue(struct sock *parent, struct socket *new_sock); void smc_close_non_accepted(struct sock *sk); static inline void smc_sock_set_flag(struct sock *sk, enum sock_flags flag) { set_bit(flag, &sk->sk_flags); } #endif /* __SMC_H */
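As a rough illustration of the 3-byte wire helpers above (not part of smc.h), the sketch below round-trips a 24-bit value through hton24()/ntoh24(); the function and variable names are hypothetical and assume smc.h is included.

/* Hypothetical helper: encode then decode the low 24 bits of @host_val. */
static u32 example_roundtrip24(u32 host_val)
{
	u8 wire[3];

	hton24(wire, host_val);	/* store low 24 bits in network byte order */
	return ntoh24(wire);	/* equals host_val & 0xffffff */
}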
/* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Copyright (c) 2014 Mahesh Bandewar <maheshb@google.com> */ #ifndef __IPVLAN_H #define __IPVLAN_H #include <linux/kernel.h> #include <linux/types.h> #include <linux/module.h> #include <linux/init.h> #include <linux/rculist.h> #include <linux/notifier.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/if_arp.h> #include <linux/if_link.h> #include <linux/if_vlan.h> #include <linux/ip.h> #include <linux/inetdevice.h> #include <linux/netfilter.h> #include <net/ip.h> #include <net/ip6_route.h> #include <net/netns/generic.h> #include <net/rtnetlink.h> #include <net/route.h> #include <net/addrconf.h> #include <net/l3mdev.h> #define IPVLAN_DRV "ipvlan" #define IPV_DRV_VER "0.1" #define IPVLAN_HASH_SIZE (1 << BITS_PER_BYTE) #define IPVLAN_HASH_MASK (IPVLAN_HASH_SIZE - 1) #define IPVLAN_MAC_FILTER_BITS 8 #define IPVLAN_MAC_FILTER_SIZE (1 << IPVLAN_MAC_FILTER_BITS) #define IPVLAN_MAC_FILTER_MASK (IPVLAN_MAC_FILTER_SIZE - 1) #define IPVLAN_QBACKLOG_LIMIT 1000 typedef enum { IPVL_IPV6 = 0, IPVL_ICMPV6, IPVL_IPV4, IPVL_ARP, } ipvl_hdr_type; struct ipvl_pcpu_stats { u64 rx_pkts; u64 rx_bytes; u64 rx_mcast; u64 tx_pkts; u64 tx_bytes; struct u64_stats_sync syncp; u32 rx_errs; u32 tx_drps; }; struct ipvl_port; struct ipvl_dev { struct net_device *dev; struct list_head pnode; struct ipvl_port *port; struct net_device *phy_dev; struct list_head addrs; struct ipvl_pcpu_stats __percpu *pcpu_stats; DECLARE_BITMAP(mac_filters, IPVLAN_MAC_FILTER_SIZE); netdev_features_t sfeatures; u32 msg_enable; spinlock_t addrs_lock; }; struct ipvl_addr { struct ipvl_dev *master; /* Back pointer to master */ union { struct in6_addr ip6; /* IPv6 address on logical interface */ struct in_addr ip4; /* IPv4 address on logical interface */ } ipu; #define ip6addr ipu.ip6 #define ip4addr ipu.ip4 struct hlist_node hlnode; /* Hash-table linkage */ struct list_head anode; /* logical-interface linkage */ ipvl_hdr_type atype; struct rcu_head rcu; }; struct ipvl_port { struct net_device *dev; possible_net_t pnet; struct hlist_head hlhead[IPVLAN_HASH_SIZE]; struct list_head ipvlans; u16 mode; u16 flags; u16 dev_id_start; struct work_struct wq; struct sk_buff_head backlog; int count; struct ida ida; }; struct ipvl_skb_cb { bool tx_pkt; }; #define IPVL_SKB_CB(_skb) ((struct ipvl_skb_cb *)&((_skb)->cb[0])) static inline struct ipvl_port *ipvlan_port_get_rcu(const struct net_device *d) { return rcu_dereference(d->rx_handler_data); } static inline struct ipvl_port *ipvlan_port_get_rcu_bh(const struct net_device *d) { return rcu_dereference_bh(d->rx_handler_data); } static inline struct ipvl_port *ipvlan_port_get_rtnl(const struct net_device *d) { return rtnl_dereference(d->rx_handler_data); } static inline bool ipvlan_is_private(const struct
ipvl_port *port) { return !!(port->flags & IPVLAN_F_PRIVATE); } static inline void ipvlan_mark_private(struct ipvl_port *port) { port->flags |= IPVLAN_F_PRIVATE; } static inline void ipvlan_clear_private(struct ipvl_port *port) { port->flags &= ~IPVLAN_F_PRIVATE; } static inline bool ipvlan_is_vepa(const struct ipvl_port *port) { return !!(port->flags & IPVLAN_F_VEPA); } static inline void ipvlan_mark_vepa(struct ipvl_port *port) { port->flags |= IPVLAN_F_VEPA; } static inline void ipvlan_clear_vepa(struct ipvl_port *port) { port->flags &= ~IPVLAN_F_VEPA; } void ipvlan_init_secret(void); unsigned int ipvlan_mac_hash(const unsigned char *addr); rx_handler_result_t ipvlan_handle_frame(struct sk_buff **pskb); void ipvlan_process_multicast(struct work_struct *work); int ipvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev); void ipvlan_ht_addr_add(struct ipvl_dev *ipvlan, struct ipvl_addr *addr); struct ipvl_addr *ipvlan_find_addr(const struct ipvl_dev *ipvlan, const void *iaddr, bool is_v6); bool ipvlan_addr_busy(struct ipvl_port *port, void *iaddr, bool is_v6); void ipvlan_ht_addr_del(struct ipvl_addr *addr); struct ipvl_addr *ipvlan_addr_lookup(struct ipvl_port *port, void *lyr3h, int addr_type, bool use_dest); void *ipvlan_get_L3_hdr(struct ipvl_port *port, struct sk_buff *skb, int *type); void ipvlan_count_rx(const struct ipvl_dev *ipvlan, unsigned int len, bool success, bool mcast); int ipvlan_link_new(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack); void ipvlan_link_delete(struct net_device *dev, struct list_head *head); void ipvlan_link_setup(struct net_device *dev); int ipvlan_link_register(struct rtnl_link_ops *ops); #ifdef CONFIG_IPVLAN_L3S int ipvlan_l3s_register(struct ipvl_port *port); void ipvlan_l3s_unregister(struct ipvl_port *port); void ipvlan_migrate_l3s_hook(struct net *oldnet, struct net *newnet); int ipvlan_l3s_init(void); void ipvlan_l3s_cleanup(void); #else static inline int ipvlan_l3s_register(struct ipvl_port *port) { return -ENOTSUPP; } static inline void ipvlan_l3s_unregister(struct ipvl_port *port) { } static inline void ipvlan_migrate_l3s_hook(struct net *oldnet, struct net *newnet) { } static inline int ipvlan_l3s_init(void) { return 0; } static inline void ipvlan_l3s_cleanup(void) { } #endif /* CONFIG_IPVLAN_L3S */ static inline bool netif_is_ipvlan_port(const struct net_device *dev) { return rcu_access_pointer(dev->rx_handler) == ipvlan_handle_frame; } #endif /* __IPVLAN_H */
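A brief, hypothetical illustration of the port-flag helpers above (not part of ipvlan.h): deciding whether traffic between two slaves on the same port may be switched internally. The function name example_switch_internally() is an assumption for the sketch, which presumes ipvlan.h is included.

static bool example_switch_internally(const struct ipvl_port *port)
{
	/*
	 * In private mode slaves may not talk to each other at all, and in
	 * VEPA mode slave-to-slave traffic must take the external path, so
	 * only the default mode switches frames internally.
	 */
	return !ipvlan_is_private(port) && !ipvlan_is_vepa(port);
}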
// SPDX-License-Identifier: GPL-2.0-only /* * Shared Memory Communications over RDMA (SMC-R) and RoCE * * SMC statistics netlink routines * * Copyright IBM Corp. 2021 * * Author(s): Guvenc Gulce */ #include <linux/init.h> #include <linux/mutex.h> #include <linux/percpu.h> #include <linux/ctype.h> #include <linux/smc.h> #include <net/genetlink.h> #include <net/sock.h> #include "smc_netlink.h" #include "smc_stats.h" int smc_stats_init(struct net *net) { net->smc.fback_rsn = kzalloc(sizeof(*net->smc.fback_rsn), GFP_KERNEL); if (!net->smc.fback_rsn) goto err_fback; net->smc.smc_stats = alloc_percpu(struct smc_stats); if (!net->smc.smc_stats) goto err_stats; mutex_init(&net->smc.mutex_fback_rsn); return 0; err_stats: kfree(net->smc.fback_rsn); err_fback: return -ENOMEM; } void smc_stats_exit(struct net *net) { kfree(net->smc.fback_rsn); if (net->smc.smc_stats) free_percpu(net->smc.smc_stats); } static int smc_nl_fill_stats_rmb_data(struct sk_buff *skb, struct smc_stats *stats, int tech, int type) { struct smc_stats_rmbcnt *stats_rmb_cnt; struct nlattr *attrs; if (type == SMC_NLA_STATS_T_TX_RMB_STATS) stats_rmb_cnt = &stats->smc[tech].rmb_tx; else stats_rmb_cnt = &stats->smc[tech].rmb_rx; attrs = nla_nest_start(skb, type); if (!attrs) goto errout; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_RMB_REUSE_CNT, stats_rmb_cnt->reuse_cnt, SMC_NLA_STATS_RMB_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_RMB_SIZE_SM_PEER_CNT, stats_rmb_cnt->buf_size_small_peer_cnt, SMC_NLA_STATS_RMB_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_RMB_SIZE_SM_CNT, stats_rmb_cnt->buf_size_small_cnt, SMC_NLA_STATS_RMB_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_RMB_FULL_PEER_CNT, stats_rmb_cnt->buf_full_peer_cnt, SMC_NLA_STATS_RMB_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_RMB_FULL_CNT, stats_rmb_cnt->buf_full_cnt, SMC_NLA_STATS_RMB_PAD)) goto errattr; if (nla_put_u64_64bit(skb,
SMC_NLA_STATS_RMB_ALLOC_CNT, stats_rmb_cnt->alloc_cnt, SMC_NLA_STATS_RMB_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_RMB_DGRADE_CNT, stats_rmb_cnt->dgrade_cnt, SMC_NLA_STATS_RMB_PAD)) goto errattr; nla_nest_end(skb, attrs); return 0; errattr: nla_nest_cancel(skb, attrs); errout: return -EMSGSIZE; } static int smc_nl_fill_stats_bufsize_data(struct sk_buff *skb, struct smc_stats *stats, int tech, int type) { struct smc_stats_memsize *stats_pload; struct nlattr *attrs; if (type == SMC_NLA_STATS_T_TXPLOAD_SIZE) stats_pload = &stats->smc[tech].tx_pd; else if (type == SMC_NLA_STATS_T_RXPLOAD_SIZE) stats_pload = &stats->smc[tech].rx_pd; else if (type == SMC_NLA_STATS_T_TX_RMB_SIZE) stats_pload = &stats->smc[tech].tx_rmbsize; else if (type == SMC_NLA_STATS_T_RX_RMB_SIZE) stats_pload = &stats->smc[tech].rx_rmbsize; else goto errout; attrs = nla_nest_start(skb, type); if (!attrs) goto errout; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_8K, stats_pload->buf[SMC_BUF_8K], SMC_NLA_STATS_PLOAD_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_16K, stats_pload->buf[SMC_BUF_16K], SMC_NLA_STATS_PLOAD_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_32K, stats_pload->buf[SMC_BUF_32K], SMC_NLA_STATS_PLOAD_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_64K, stats_pload->buf[SMC_BUF_64K], SMC_NLA_STATS_PLOAD_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_128K, stats_pload->buf[SMC_BUF_128K], SMC_NLA_STATS_PLOAD_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_256K, stats_pload->buf[SMC_BUF_256K], SMC_NLA_STATS_PLOAD_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_512K, stats_pload->buf[SMC_BUF_512K], SMC_NLA_STATS_PLOAD_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_1024K, stats_pload->buf[SMC_BUF_1024K], SMC_NLA_STATS_PLOAD_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_PLOAD_G_1024K, stats_pload->buf[SMC_BUF_G_1024K], SMC_NLA_STATS_PLOAD_PAD)) goto errattr; nla_nest_end(skb, attrs); return 0; errattr: nla_nest_cancel(skb, attrs); errout: return -EMSGSIZE; } static int smc_nl_fill_stats_tech_data(struct sk_buff *skb, struct smc_stats *stats, int tech) { struct smc_stats_tech *smc_tech; struct nlattr *attrs; smc_tech = &stats->smc[tech]; if (tech == SMC_TYPE_D) attrs = nla_nest_start(skb, SMC_NLA_STATS_SMCD_TECH); else attrs = nla_nest_start(skb, SMC_NLA_STATS_SMCR_TECH); if (!attrs) goto errout; if (smc_nl_fill_stats_rmb_data(skb, stats, tech, SMC_NLA_STATS_T_TX_RMB_STATS)) goto errattr; if (smc_nl_fill_stats_rmb_data(skb, stats, tech, SMC_NLA_STATS_T_RX_RMB_STATS)) goto errattr; if (smc_nl_fill_stats_bufsize_data(skb, stats, tech, SMC_NLA_STATS_T_TXPLOAD_SIZE)) goto errattr; if (smc_nl_fill_stats_bufsize_data(skb, stats, tech, SMC_NLA_STATS_T_RXPLOAD_SIZE)) goto errattr; if (smc_nl_fill_stats_bufsize_data(skb, stats, tech, SMC_NLA_STATS_T_TX_RMB_SIZE)) goto errattr; if (smc_nl_fill_stats_bufsize_data(skb, stats, tech, SMC_NLA_STATS_T_RX_RMB_SIZE)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_CLNT_V1_SUCC, smc_tech->clnt_v1_succ_cnt, SMC_NLA_STATS_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_CLNT_V2_SUCC, smc_tech->clnt_v2_succ_cnt, SMC_NLA_STATS_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_SRV_V1_SUCC, smc_tech->srv_v1_succ_cnt, SMC_NLA_STATS_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_SRV_V2_SUCC, smc_tech->srv_v2_succ_cnt, SMC_NLA_STATS_PAD)) goto errattr; if 
(nla_put_u64_64bit(skb, SMC_NLA_STATS_T_RX_BYTES, smc_tech->rx_bytes, SMC_NLA_STATS_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_TX_BYTES, smc_tech->tx_bytes, SMC_NLA_STATS_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_RX_CNT, smc_tech->rx_cnt, SMC_NLA_STATS_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_TX_CNT, smc_tech->tx_cnt, SMC_NLA_STATS_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_SENDPAGE_CNT, smc_tech->sendpage_cnt, SMC_NLA_STATS_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_CORK_CNT, smc_tech->cork_cnt, SMC_NLA_STATS_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_NDLY_CNT, smc_tech->ndly_cnt, SMC_NLA_STATS_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_SPLICE_CNT, smc_tech->splice_cnt, SMC_NLA_STATS_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_T_URG_DATA_CNT, smc_tech->urg_data_cnt, SMC_NLA_STATS_PAD)) goto errattr; nla_nest_end(skb, attrs); return 0; errattr: nla_nest_cancel(skb, attrs); errout: return -EMSGSIZE; } int smc_nl_get_stats(struct sk_buff *skb, struct netlink_callback *cb) { struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); struct net *net = sock_net(skb->sk); struct smc_stats *stats; struct nlattr *attrs; int cpu, i, size; void *nlh; u64 *src; u64 *sum; if (cb_ctx->pos[0]) goto errmsg; nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &smc_gen_nl_family, NLM_F_MULTI, SMC_NETLINK_GET_STATS); if (!nlh) goto errmsg; attrs = nla_nest_start(skb, SMC_GEN_STATS); if (!attrs) goto errnest; stats = kzalloc(sizeof(*stats), GFP_KERNEL); if (!stats) goto erralloc; size = sizeof(*stats) / sizeof(u64); for_each_possible_cpu(cpu) { src = (u64 *)per_cpu_ptr(net->smc.smc_stats, cpu); sum = (u64 *)stats; for (i = 0; i < size; i++) *(sum++) += *(src++); } if (smc_nl_fill_stats_tech_data(skb, stats, SMC_TYPE_D)) goto errattr; if (smc_nl_fill_stats_tech_data(skb, stats, SMC_TYPE_R)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_CLNT_HS_ERR_CNT, stats->clnt_hshake_err_cnt, SMC_NLA_STATS_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_STATS_SRV_HS_ERR_CNT, stats->srv_hshake_err_cnt, SMC_NLA_STATS_PAD)) goto errattr; nla_nest_end(skb, attrs); genlmsg_end(skb, nlh); cb_ctx->pos[0] = 1; kfree(stats); return skb->len; errattr: kfree(stats); erralloc: nla_nest_cancel(skb, attrs); errnest: genlmsg_cancel(skb, nlh); errmsg: return skb->len; } static int smc_nl_get_fback_details(struct sk_buff *skb, struct netlink_callback *cb, int pos, bool is_srv) { struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); struct net *net = sock_net(skb->sk); int cnt_reported = cb_ctx->pos[2]; struct smc_stats_fback *trgt_arr; struct nlattr *attrs; int rc = 0; void *nlh; if (is_srv) trgt_arr = &net->smc.fback_rsn->srv[0]; else trgt_arr = &net->smc.fback_rsn->clnt[0]; if (!trgt_arr[pos].fback_code) return -ENODATA; nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &smc_gen_nl_family, NLM_F_MULTI, SMC_NETLINK_GET_FBACK_STATS); if (!nlh) goto errmsg; attrs = nla_nest_start(skb, SMC_GEN_FBACK_STATS); if (!attrs) goto errout; if (nla_put_u8(skb, SMC_NLA_FBACK_STATS_TYPE, is_srv)) goto errattr; if (!cnt_reported) { if (nla_put_u64_64bit(skb, SMC_NLA_FBACK_STATS_SRV_CNT, net->smc.fback_rsn->srv_fback_cnt, SMC_NLA_FBACK_STATS_PAD)) goto errattr; if (nla_put_u64_64bit(skb, SMC_NLA_FBACK_STATS_CLNT_CNT, net->smc.fback_rsn->clnt_fback_cnt, SMC_NLA_FBACK_STATS_PAD)) goto errattr; cnt_reported = 1; } if (nla_put_u32(skb, 
SMC_NLA_FBACK_STATS_RSN_CODE, trgt_arr[pos].fback_code)) goto errattr; if (nla_put_u16(skb, SMC_NLA_FBACK_STATS_RSN_CNT, trgt_arr[pos].count)) goto errattr; cb_ctx->pos[2] = cnt_reported; nla_nest_end(skb, attrs); genlmsg_end(skb, nlh); return rc; errattr: nla_nest_cancel(skb, attrs); errout: genlmsg_cancel(skb, nlh); errmsg: return -EMSGSIZE; } int smc_nl_get_fback_stats(struct sk_buff *skb, struct netlink_callback *cb) { struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); struct net *net = sock_net(skb->sk); int rc_srv = 0, rc_clnt = 0, k; int skip_serv = cb_ctx->pos[1]; int snum = cb_ctx->pos[0]; bool is_srv = true; mutex_lock(&net->smc.mutex_fback_rsn); for (k = 0; k < SMC_MAX_FBACK_RSN_CNT; k++) { if (k < snum) continue; if (!skip_serv) { rc_srv = smc_nl_get_fback_details(skb, cb, k, is_srv); if (rc_srv && rc_srv != -ENODATA) break; } else { skip_serv = 0; } rc_clnt = smc_nl_get_fback_details(skb, cb, k, !is_srv); if (rc_clnt && rc_clnt != -ENODATA) { skip_serv = 1; break; } if (rc_clnt == -ENODATA && rc_srv == -ENODATA) break; } mutex_unlock(&net->smc.mutex_fback_rsn); cb_ctx->pos[1] = skip_serv; cb_ctx->pos[0] = k; return skb->len; }
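The per-CPU aggregation idiom used by smc_nl_get_stats() above, shown in isolation as a hedged sketch. It treats the stats structure as a flat array of u64 counters and sums each slot across all possible CPUs; example_sum_stats() is hypothetical and not part of this file.

/* Hypothetical helper mirroring the summation loop in smc_nl_get_stats(). */
static void example_sum_stats(struct net *net, struct smc_stats *sum)
{
	int size = sizeof(*sum) / sizeof(u64);	/* all fields are u64 counters */
	int cpu, i;

	memset(sum, 0, sizeof(*sum));
	for_each_possible_cpu(cpu) {
		u64 *src = (u64 *)per_cpu_ptr(net->smc.smc_stats, cpu);
		u64 *dst = (u64 *)sum;

		for (i = 0; i < size; i++)
			*(dst++) += *(src++);
	}
}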
/* SPDX-License-Identifier: GPL-2.0 */ #include <linux/fs.h> #define DEVCG_ACC_MKNOD 1 #define DEVCG_ACC_READ 2 #define DEVCG_ACC_WRITE 4 #define DEVCG_ACC_MASK (DEVCG_ACC_MKNOD | DEVCG_ACC_READ | DEVCG_ACC_WRITE) #define DEVCG_DEV_BLOCK 1 #define DEVCG_DEV_CHAR 2 #define DEVCG_DEV_ALL 4 /* this represents all devices */ #if defined(CONFIG_CGROUP_DEVICE) || defined(CONFIG_CGROUP_BPF) int devcgroup_check_permission(short type, u32 major, u32 minor, short access); static inline int devcgroup_inode_permission(struct inode *inode, int mask) { short type, access = 0; if (likely(!inode->i_rdev)) return 0; if (S_ISBLK(inode->i_mode)) type = DEVCG_DEV_BLOCK; else if (S_ISCHR(inode->i_mode)) type = DEVCG_DEV_CHAR; else return 0; if (mask & MAY_WRITE) access |= DEVCG_ACC_WRITE; if (mask & MAY_READ) access |= DEVCG_ACC_READ; return devcgroup_check_permission(type, imajor(inode), iminor(inode), access); } static inline int devcgroup_inode_mknod(int mode, dev_t dev) { short type; if (!S_ISBLK(mode) && !S_ISCHR(mode)) return 0; if (S_ISCHR(mode) && dev == WHITEOUT_DEV) return 0; if (S_ISBLK(mode)) type = DEVCG_DEV_BLOCK; else type = DEVCG_DEV_CHAR; return devcgroup_check_permission(type, MAJOR(dev), MINOR(dev), DEVCG_ACC_MKNOD); } #else static inline int devcgroup_check_permission(short type, u32 major, u32 minor, short access) { return 0; } static inline int devcgroup_inode_permission(struct inode *inode, int mask) { return 0; } static inline int devcgroup_inode_mknod(int mode, dev_t dev) { return 0; } #endif
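As a hedged illustration of how the helpers above are meant to be called (not part of device_cgroup.h), the sketch below maps file-mode bits to an access mask before checking the device cgroup on an open of a device node; example_devnode_open_allowed() is hypothetical and assumes the header above is included.

/* Hypothetical caller: translate fmode_t into MAY_* bits and check access. */
static int example_devnode_open_allowed(struct inode *inode, fmode_t fmode)
{
	int mask = 0;

	if (fmode & FMODE_READ)
		mask |= MAY_READ;
	if (fmode & FMODE_WRITE)
		mask |= MAY_WRITE;

	/* 0 when access is permitted, a negative errno when the cgroup denies it. */
	return devcgroup_inode_permission(inode, mask);
}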
/* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM block #if !defined(_TRACE_BLOCK_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_BLOCK_H #include <linux/blktrace_api.h> #include <linux/blkdev.h> #include <linux/buffer_head.h> #include <linux/tracepoint.h> #define RWBS_LEN 8 DECLARE_EVENT_CLASS(block_buffer, TP_PROTO(struct buffer_head *bh), TP_ARGS(bh), TP_STRUCT__entry ( __field( dev_t, dev ) __field( sector_t, sector ) __field( size_t, size ) ), TP_fast_assign( __entry->dev = bh->b_bdev->bd_dev; __entry->sector = bh->b_blocknr; __entry->size = bh->b_size; ), TP_printk("%d,%d sector=%llu size=%zu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->sector, __entry->size ) ); /** * block_touch_buffer - mark a buffer accessed * @bh: buffer_head being touched * * Called from touch_buffer(). */ DEFINE_EVENT(block_buffer, block_touch_buffer, TP_PROTO(struct buffer_head *bh), TP_ARGS(bh) ); /** * block_dirty_buffer - mark a buffer dirty * @bh: buffer_head being dirtied * * Called from mark_buffer_dirty(). */ DEFINE_EVENT(block_buffer, block_dirty_buffer, TP_PROTO(struct buffer_head *bh), TP_ARGS(bh) ); /** * block_rq_requeue - place block IO request back on a queue * @rq: block IO operation request * * The block operation request @rq is being placed back into queue * @q. For some reason the request was not completed and needs to be * put back in the queue.
*/ TRACE_EVENT(block_rq_requeue, TP_PROTO(struct request *rq), TP_ARGS(rq), TP_STRUCT__entry( __field( dev_t, dev ) __field( sector_t, sector ) __field( unsigned int, nr_sector ) __array( char, rwbs, RWBS_LEN ) __dynamic_array( char, cmd, 1 ) ), TP_fast_assign( __entry->dev = rq->rq_disk ? disk_devt(rq->rq_disk) : 0; __entry->sector = blk_rq_trace_sector(rq); __entry->nr_sector = blk_rq_trace_nr_sectors(rq); blk_fill_rwbs(__entry->rwbs, rq->cmd_flags); __get_str(cmd)[0] = '\0'; ), TP_printk("%d,%d %s (%s) %llu + %u [%d]", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, __get_str(cmd), (unsigned long long)__entry->sector, __entry->nr_sector, 0) ); /** * block_rq_complete - block IO operation completed by device driver * @rq: block operations request * @error: status code * @nr_bytes: number of completed bytes * * The block_rq_complete tracepoint event indicates that some portion * of operation request has been completed by the device driver. If * the @rq->bio is %NULL, then there is absolutely no additional work to * do for the request. If @rq->bio is non-NULL then there is * additional work required to complete the request. */ TRACE_EVENT(block_rq_complete, TP_PROTO(struct request *rq, int error, unsigned int nr_bytes), TP_ARGS(rq, error, nr_bytes), TP_STRUCT__entry( __field( dev_t, dev ) __field( sector_t, sector ) __field( unsigned int, nr_sector ) __field( int, error ) __array( char, rwbs, RWBS_LEN ) __dynamic_array( char, cmd, 1 ) ), TP_fast_assign( __entry->dev = rq->rq_disk ? disk_devt(rq->rq_disk) : 0; __entry->sector = blk_rq_pos(rq); __entry->nr_sector = nr_bytes >> 9; __entry->error = error; blk_fill_rwbs(__entry->rwbs, rq->cmd_flags); __get_str(cmd)[0] = '\0'; ), TP_printk("%d,%d %s (%s) %llu + %u [%d]", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, __get_str(cmd), (unsigned long long)__entry->sector, __entry->nr_sector, __entry->error) ); DECLARE_EVENT_CLASS(block_rq, TP_PROTO(struct request *rq), TP_ARGS(rq), TP_STRUCT__entry( __field( dev_t, dev ) __field( sector_t, sector ) __field( unsigned int, nr_sector ) __field( unsigned int, bytes ) __array( char, rwbs, RWBS_LEN ) __array( char, comm, TASK_COMM_LEN ) __dynamic_array( char, cmd, 1 ) ), TP_fast_assign( __entry->dev = rq->rq_disk ? disk_devt(rq->rq_disk) : 0; __entry->sector = blk_rq_trace_sector(rq); __entry->nr_sector = blk_rq_trace_nr_sectors(rq); __entry->bytes = blk_rq_bytes(rq); blk_fill_rwbs(__entry->rwbs, rq->cmd_flags); __get_str(cmd)[0] = '\0'; memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), TP_printk("%d,%d %s %u (%s) %llu + %u [%s]", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, __entry->bytes, __get_str(cmd), (unsigned long long)__entry->sector, __entry->nr_sector, __entry->comm) ); /** * block_rq_insert - insert block operation request into queue * @rq: block IO operation request * * Called immediately before block operation request @rq is inserted * into queue @q. The fields in the operation request @rq struct can * be examined to determine which device and sectors the pending * operation would access. */ DEFINE_EVENT(block_rq, block_rq_insert, TP_PROTO(struct request *rq), TP_ARGS(rq) ); /** * block_rq_issue - issue pending block IO request operation to device driver * @rq: block IO operation request * * Called when block operation request @rq from queue @q is sent to a * device driver for processing. 
*/ DEFINE_EVENT(block_rq, block_rq_issue, TP_PROTO(struct request *rq), TP_ARGS(rq) ); /** * block_rq_merge - merge request with another one in the elevator * @rq: block IO operation request * * Called when block operation request @rq from queue @q is merged to another * request queued in the elevator. */ DEFINE_EVENT(block_rq, block_rq_merge, TP_PROTO(struct request *rq), TP_ARGS(rq) ); /** * block_bio_complete - completed all work on the block operation * @q: queue holding the block operation * @bio: block operation completed * * This tracepoint indicates there is no further work to do on this * block IO operation @bio. */ TRACE_EVENT(block_bio_complete, TP_PROTO(struct request_queue *q, struct bio *bio), TP_ARGS(q, bio), TP_STRUCT__entry( __field( dev_t, dev ) __field( sector_t, sector ) __field( unsigned, nr_sector ) __field( int, error ) __array( char, rwbs, RWBS_LEN) ), TP_fast_assign( __entry->dev = bio_dev(bio); __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio_sectors(bio); __entry->error = blk_status_to_errno(bio->bi_status); blk_fill_rwbs(__entry->rwbs, bio->bi_opf); ), TP_printk("%d,%d %s %llu + %u [%d]", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, (unsigned long long)__entry->sector, __entry->nr_sector, __entry->error) ); DECLARE_EVENT_CLASS(block_bio, TP_PROTO(struct bio *bio), TP_ARGS(bio), TP_STRUCT__entry( __field( dev_t, dev ) __field( sector_t, sector ) __field( unsigned int, nr_sector ) __array( char, rwbs, RWBS_LEN ) __array( char, comm, TASK_COMM_LEN ) ), TP_fast_assign( __entry->dev = bio_dev(bio); __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio_sectors(bio); blk_fill_rwbs(__entry->rwbs, bio->bi_opf); memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), TP_printk("%d,%d %s %llu + %u [%s]", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, (unsigned long long)__entry->sector, __entry->nr_sector, __entry->comm) ); /** * block_bio_bounce - used bounce buffer when processing block operation * @bio: block operation * * A bounce buffer was used to handle the block operation @bio in @q. * This occurs when hardware limitations prevent a direct transfer of * data between the @bio data memory area and the IO device. Use of a * bounce buffer requires extra copying of data and decreases * performance. */ DEFINE_EVENT(block_bio, block_bio_bounce, TP_PROTO(struct bio *bio), TP_ARGS(bio) ); /** * block_bio_backmerge - merging block operation to the end of an existing operation * @bio: new block operation to merge * * Merging block request @bio to the end of an existing block request. */ DEFINE_EVENT(block_bio, block_bio_backmerge, TP_PROTO(struct bio *bio), TP_ARGS(bio) ); /** * block_bio_frontmerge - merging block operation to the beginning of an existing operation * @bio: new block operation to merge * * Merging block IO operation @bio to the beginning of an existing block request. */ DEFINE_EVENT(block_bio, block_bio_frontmerge, TP_PROTO(struct bio *bio), TP_ARGS(bio) ); /** * block_bio_queue - putting new block IO operation in queue * @bio: new block operation * * About to place the block IO operation @bio into queue @q. */ DEFINE_EVENT(block_bio, block_bio_queue, TP_PROTO(struct bio *bio), TP_ARGS(bio) ); /** * block_getrq - get a free request entry in queue for block IO operations * @bio: pending block IO operation (can be %NULL) * * A request struct has been allocated to handle the block IO operation @bio. 
*/ DEFINE_EVENT(block_bio, block_getrq, TP_PROTO(struct bio *bio), TP_ARGS(bio) ); /** * block_plug - keep operations requests in request queue * @q: request queue to plug * * Plug the request queue @q. Do not allow block operation requests * to be sent to the device driver. Instead, accumulate requests in * the queue to improve throughput performance of the block device. */ TRACE_EVENT(block_plug, TP_PROTO(struct request_queue *q), TP_ARGS(q), TP_STRUCT__entry( __array( char, comm, TASK_COMM_LEN ) ), TP_fast_assign( memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), TP_printk("[%s]", __entry->comm) ); DECLARE_EVENT_CLASS(block_unplug, TP_PROTO(struct request_queue *q, unsigned int depth, bool explicit), TP_ARGS(q, depth, explicit), TP_STRUCT__entry( __field( int, nr_rq ) __array( char, comm, TASK_COMM_LEN ) ), TP_fast_assign( __entry->nr_rq = depth; memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), TP_printk("[%s] %d", __entry->comm, __entry->nr_rq) ); /** * block_unplug - release of operations requests in request queue * @q: request queue to unplug * @depth: number of requests just added to the queue * @explicit: whether this was an explicit unplug, or one from schedule() * * Unplug request queue @q because device driver is scheduled to work * on elements in the request queue. */ DEFINE_EVENT(block_unplug, block_unplug, TP_PROTO(struct request_queue *q, unsigned int depth, bool explicit), TP_ARGS(q, depth, explicit) ); /** * block_split - split a single bio struct into two bio structs * @bio: block operation being split * @new_sector: The starting sector for the new bio * * The bio request @bio needs to be split into two bio requests. The newly * created @bio request starts at @new_sector. This split may be required due to * hardware limitations such as operation crossing device boundaries in a RAID * system. */ TRACE_EVENT(block_split, TP_PROTO(struct bio *bio, unsigned int new_sector), TP_ARGS(bio, new_sector), TP_STRUCT__entry( __field( dev_t, dev ) __field( sector_t, sector ) __field( sector_t, new_sector ) __array( char, rwbs, RWBS_LEN ) __array( char, comm, TASK_COMM_LEN ) ), TP_fast_assign( __entry->dev = bio_dev(bio); __entry->sector = bio->bi_iter.bi_sector; __entry->new_sector = new_sector; blk_fill_rwbs(__entry->rwbs, bio->bi_opf); memcpy(__entry->comm, current->comm, TASK_COMM_LEN); ), TP_printk("%d,%d %s %llu / %llu [%s]", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, (unsigned long long)__entry->sector, (unsigned long long)__entry->new_sector, __entry->comm) ); /** * block_bio_remap - map request for a logical device to the raw device * @bio: revised operation * @dev: original device for the operation * @from: original sector for the operation * * An operation for a logical device has been mapped to the * raw block device. 
*/ TRACE_EVENT(block_bio_remap, TP_PROTO(struct bio *bio, dev_t dev, sector_t from), TP_ARGS(bio, dev, from), TP_STRUCT__entry( __field( dev_t, dev ) __field( sector_t, sector ) __field( unsigned int, nr_sector ) __field( dev_t, old_dev ) __field( sector_t, old_sector ) __array( char, rwbs, RWBS_LEN) ), TP_fast_assign( __entry->dev = bio_dev(bio); __entry->sector = bio->bi_iter.bi_sector; __entry->nr_sector = bio_sectors(bio); __entry->old_dev = dev; __entry->old_sector = from; blk_fill_rwbs(__entry->rwbs, bio->bi_opf); ), TP_printk("%d,%d %s %llu + %u <- (%d,%d) %llu", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, (unsigned long long)__entry->sector, __entry->nr_sector, MAJOR(__entry->old_dev), MINOR(__entry->old_dev), (unsigned long long)__entry->old_sector) ); /** * block_rq_remap - map request for a block operation request * @rq: block IO operation request * @dev: device for the operation * @from: original sector for the operation * * The block operation request @rq in @q has been remapped. The block * operation request @rq holds the current information and @from hold * the original sector. */ TRACE_EVENT(block_rq_remap, TP_PROTO(struct request *rq, dev_t dev, sector_t from), TP_ARGS(rq, dev, from), TP_STRUCT__entry( __field( dev_t, dev ) __field( sector_t, sector ) __field( unsigned int, nr_sector ) __field( dev_t, old_dev ) __field( sector_t, old_sector ) __field( unsigned int, nr_bios ) __array( char, rwbs, RWBS_LEN) ), TP_fast_assign( __entry->dev = disk_devt(rq->rq_disk); __entry->sector = blk_rq_pos(rq); __entry->nr_sector = blk_rq_sectors(rq); __entry->old_dev = dev; __entry->old_sector = from; __entry->nr_bios = blk_rq_count_bios(rq); blk_fill_rwbs(__entry->rwbs, rq->cmd_flags); ), TP_printk("%d,%d %s %llu + %u <- (%d,%d) %llu %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, (unsigned long long)__entry->sector, __entry->nr_sector, MAJOR(__entry->old_dev), MINOR(__entry->old_dev), (unsigned long long)__entry->old_sector, __entry->nr_bios) ); #endif /* _TRACE_BLOCK_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
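The tracepoints declared in this header are consumed either through tracefs (the events under events/block/) or by attaching a probe from kernel code, which is how blktrace uses them. The following is a minimal sketch, assuming an out-of-tree module and that the block_rq_issue tracepoint is exported to modules; the probe and module names are illustrative and not part of the header above. register_trace_block_rq_issue()/unregister_trace_block_rq_issue() are the helpers generated by TRACE_EVENT()/DEFINE_EVENT().

/* Sketch only: attach a probe to the block_rq_issue tracepoint declared
 * above and log the issued request's position and size.
 */
#include <linux/module.h>
#include <linux/blkdev.h>
#include <trace/events/block.h>

static void probe_rq_issue(void *ignore, struct request *rq)
{
	/* Runs in tracepoint context; keep it short and non-sleeping. */
	pr_debug("block_rq_issue: %u sectors at %llu\n",
		 blk_rq_sectors(rq),
		 (unsigned long long)blk_rq_pos(rq));
}

static int __init rq_issue_probe_init(void)
{
	return register_trace_block_rq_issue(probe_rq_issue, NULL);
}

static void __exit rq_issue_probe_exit(void)
{
	unregister_trace_block_rq_issue(probe_rq_issue, NULL);
	tracepoint_synchronize_unregister();
}

module_init(rq_issue_probe_init);
module_exit(rq_issue_probe_exit);
MODULE_LICENSE("GPL");

For ad-hoc use, the same event can simply be enabled from user space via the events/block/block_rq_issue/enable file in tracefs, with the formatted output (the TP_printk format above) appearing in the trace buffer.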
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * net/sched/act_pedit.c	Generic packet editor
 *
 * Authors:	Jamal Hadi Salim (2002-4)
 */

#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/slab.h>
#include <net/ipv6.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <linux/tc_act/tc_pedit.h>
#include <net/tc_act/tc_pedit.h>
#include <uapi/linux/tc_act/tc_pedit.h>
#include <net/pkt_cls.h>

static unsigned int pedit_net_id;
static struct tc_action_ops act_pedit_ops;

static const struct nla_policy pedit_policy[TCA_PEDIT_MAX + 1] = {
	[TCA_PEDIT_PARMS]	= { .len = sizeof(struct tc_pedit) },
	[TCA_PEDIT_PARMS_EX]	= { .len = sizeof(struct tc_pedit) },
	[TCA_PEDIT_KEYS_EX]	= { .type = NLA_NESTED },
};

static const struct nla_policy pedit_key_ex_policy[TCA_PEDIT_KEY_EX_MAX + 1] = {
	[TCA_PEDIT_KEY_EX_HTYPE]	= { .type = NLA_U16 },
	[TCA_PEDIT_KEY_EX_CMD]		= { .type = NLA_U16 },
};

static struct tcf_pedit_key_ex *tcf_pedit_keys_ex_parse(struct nlattr *nla,
							u8 n)
{
	struct
tcf_pedit_key_ex *keys_ex; struct tcf_pedit_key_ex *k; const struct nlattr *ka; int err = -EINVAL; int rem; if (!nla) return NULL; keys_ex = kcalloc(n, sizeof(*k), GFP_KERNEL); if (!keys_ex) return ERR_PTR(-ENOMEM); k = keys_ex; nla_for_each_nested(ka, nla, rem) { struct nlattr *tb[TCA_PEDIT_KEY_EX_MAX + 1]; if (!n) { err = -EINVAL; goto err_out; } n--; if (nla_type(ka) != TCA_PEDIT_KEY_EX) { err = -EINVAL; goto err_out; } err = nla_parse_nested_deprecated(tb, TCA_PEDIT_KEY_EX_MAX, ka, pedit_key_ex_policy, NULL); if (err) goto err_out; if (!tb[TCA_PEDIT_KEY_EX_HTYPE] || !tb[TCA_PEDIT_KEY_EX_CMD]) { err = -EINVAL; goto err_out; } k->htype = nla_get_u16(tb[TCA_PEDIT_KEY_EX_HTYPE]); k->cmd = nla_get_u16(tb[TCA_PEDIT_KEY_EX_CMD]); if (k->htype > TCA_PEDIT_HDR_TYPE_MAX || k->cmd > TCA_PEDIT_CMD_MAX) { err = -EINVAL; goto err_out; } k++; } if (n) { err = -EINVAL; goto err_out; } return keys_ex; err_out: kfree(keys_ex); return ERR_PTR(err); } static int tcf_pedit_key_ex_dump(struct sk_buff *skb, struct tcf_pedit_key_ex *keys_ex, int n) { struct nlattr *keys_start = nla_nest_start_noflag(skb, TCA_PEDIT_KEYS_EX); if (!keys_start) goto nla_failure; for (; n > 0; n--) { struct nlattr *key_start; key_start = nla_nest_start_noflag(skb, TCA_PEDIT_KEY_EX); if (!key_start) goto nla_failure; if (nla_put_u16(skb, TCA_PEDIT_KEY_EX_HTYPE, keys_ex->htype) || nla_put_u16(skb, TCA_PEDIT_KEY_EX_CMD, keys_ex->cmd)) goto nla_failure; nla_nest_end(skb, key_start); keys_ex++; } nla_nest_end(skb, keys_start); return 0; nla_failure: nla_nest_cancel(skb, keys_start); return -EINVAL; } static void tcf_pedit_cleanup_rcu(struct rcu_head *head) { struct tcf_pedit_parms *parms = container_of(head, struct tcf_pedit_parms, rcu); kfree(parms->tcfp_keys_ex); kfree(parms->tcfp_keys); kfree(parms); } static int tcf_pedit_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, pedit_net_id); bool bind = flags & TCA_ACT_FLAGS_BIND; struct tcf_chain *goto_ch = NULL; struct tcf_pedit_parms *oparms, *nparms; struct nlattr *tb[TCA_PEDIT_MAX + 1]; struct tc_pedit *parm; struct nlattr *pattr; struct tcf_pedit *p; int ret = 0, err; int i, ksize; u32 index; if (!nla) { NL_SET_ERR_MSG_MOD(extack, "Pedit requires attributes to be passed"); return -EINVAL; } err = nla_parse_nested_deprecated(tb, TCA_PEDIT_MAX, nla, pedit_policy, NULL); if (err < 0) return err; pattr = tb[TCA_PEDIT_PARMS]; if (!pattr) pattr = tb[TCA_PEDIT_PARMS_EX]; if (!pattr) { NL_SET_ERR_MSG_MOD(extack, "Missing required TCA_PEDIT_PARMS or TCA_PEDIT_PARMS_EX pedit attribute"); return -EINVAL; } parm = nla_data(pattr); index = parm->index; err = tcf_idr_check_alloc(tn, &index, a, bind); if (!err) { ret = tcf_idr_create_from_flags(tn, index, est, a, &act_pedit_ops, bind, flags); if (ret) { tcf_idr_cleanup(tn, index); return ret; } ret = ACT_P_CREATED; } else if (err > 0) { if (bind) return 0; if (!(flags & TCA_ACT_FLAGS_REPLACE)) { ret = -EEXIST; goto out_release; } } else { return err; } if (!parm->nkeys) { NL_SET_ERR_MSG_MOD(extack, "Pedit requires keys to be passed"); ret = -EINVAL; goto out_release; } ksize = parm->nkeys * sizeof(struct tc_pedit_key); if (nla_len(pattr) < sizeof(*parm) + ksize) { NL_SET_ERR_MSG_ATTR(extack, pattr, "Length of TCA_PEDIT_PARMS or TCA_PEDIT_PARMS_EX pedit attribute is invalid"); ret = -EINVAL; goto out_release; } nparms = kzalloc(sizeof(*nparms), GFP_KERNEL); if (!nparms) { ret = -ENOMEM; goto out_release; 
} nparms->tcfp_keys_ex = tcf_pedit_keys_ex_parse(tb[TCA_PEDIT_KEYS_EX], parm->nkeys); if (IS_ERR(nparms->tcfp_keys_ex)) { ret = PTR_ERR(nparms->tcfp_keys_ex); goto out_free; } err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack); if (err < 0) { ret = err; goto out_free_ex; } nparms->tcfp_off_max_hint = 0; nparms->tcfp_flags = parm->flags; nparms->tcfp_nkeys = parm->nkeys; nparms->tcfp_keys = kmalloc(ksize, GFP_KERNEL); if (!nparms->tcfp_keys) { ret = -ENOMEM; goto put_chain; } memcpy(nparms->tcfp_keys, parm->keys, ksize); for (i = 0; i < nparms->tcfp_nkeys; ++i) { u32 cur = nparms->tcfp_keys[i].off; /* sanitize the shift value for any later use */ nparms->tcfp_keys[i].shift = min_t(size_t, BITS_PER_TYPE(int) - 1, nparms->tcfp_keys[i].shift); /* The AT option can read a single byte, we can bound the actual * value with uchar max. */ cur += (0xff & nparms->tcfp_keys[i].offmask) >> nparms->tcfp_keys[i].shift; /* Each key touches 4 bytes starting from the computed offset */ nparms->tcfp_off_max_hint = max(nparms->tcfp_off_max_hint, cur + 4); } p = to_pedit(*a); spin_lock_bh(&p->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); oparms = rcu_replace_pointer(p->parms, nparms, 1); spin_unlock_bh(&p->tcf_lock); if (oparms) call_rcu(&oparms->rcu, tcf_pedit_cleanup_rcu); if (goto_ch) tcf_chain_put_by_act(goto_ch); return ret; put_chain: if (goto_ch) tcf_chain_put_by_act(goto_ch); out_free_ex: kfree(nparms->tcfp_keys_ex); out_free: kfree(nparms); out_release: tcf_idr_release(*a, bind); return ret; } static void tcf_pedit_cleanup(struct tc_action *a) { struct tcf_pedit *p = to_pedit(a); struct tcf_pedit_parms *parms; parms = rcu_dereference_protected(p->parms, 1); if (parms) call_rcu(&parms->rcu, tcf_pedit_cleanup_rcu); } static bool offset_valid(struct sk_buff *skb, int offset) { if (offset > 0 && offset > skb->len) return false; if (offset < 0 && -offset > skb_headroom(skb)) return false; return true; } static int pedit_l4_skb_offset(struct sk_buff *skb, int *hoffset, const int header_type) { const int noff = skb_network_offset(skb); int ret = -EINVAL; struct iphdr _iph; switch (skb->protocol) { case htons(ETH_P_IP): { const struct iphdr *iph = skb_header_pointer(skb, noff, sizeof(_iph), &_iph); if (!iph) goto out; *hoffset = noff + iph->ihl * 4; ret = 0; break; } case htons(ETH_P_IPV6): ret = ipv6_find_hdr(skb, hoffset, header_type, NULL, NULL) == header_type ? 
0 : -EINVAL; break; } out: return ret; } static int pedit_skb_hdr_offset(struct sk_buff *skb, enum pedit_header_type htype, int *hoffset) { int ret = -EINVAL; /* 'htype' is validated in the netlink parsing */ switch (htype) { case TCA_PEDIT_KEY_EX_HDR_TYPE_ETH: if (skb_mac_header_was_set(skb)) { *hoffset = skb_mac_offset(skb); ret = 0; } break; case TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK: case TCA_PEDIT_KEY_EX_HDR_TYPE_IP4: case TCA_PEDIT_KEY_EX_HDR_TYPE_IP6: *hoffset = skb_network_offset(skb); ret = 0; break; case TCA_PEDIT_KEY_EX_HDR_TYPE_TCP: ret = pedit_l4_skb_offset(skb, hoffset, IPPROTO_TCP); break; case TCA_PEDIT_KEY_EX_HDR_TYPE_UDP: ret = pedit_l4_skb_offset(skb, hoffset, IPPROTO_UDP); break; default: break; } return ret; } static int tcf_pedit_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { enum pedit_header_type htype = TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK; enum pedit_cmd cmd = TCA_PEDIT_KEY_EX_CMD_SET; struct tcf_pedit *p = to_pedit(a); struct tcf_pedit_key_ex *tkey_ex; struct tcf_pedit_parms *parms; struct tc_pedit_key *tkey; u32 max_offset; int i; parms = rcu_dereference_bh(p->parms); max_offset = (skb_transport_header_was_set(skb) ? skb_transport_offset(skb) : skb_network_offset(skb)) + parms->tcfp_off_max_hint; if (skb_ensure_writable(skb, min(skb->len, max_offset))) goto done; tcf_lastuse_update(&p->tcf_tm); tcf_action_update_bstats(&p->common, skb); tkey = parms->tcfp_keys; tkey_ex = parms->tcfp_keys_ex; for (i = parms->tcfp_nkeys; i > 0; i--, tkey++) { int offset = tkey->off; int hoffset = 0; u32 *ptr, hdata; u32 val; int rc; if (tkey_ex) { htype = tkey_ex->htype; cmd = tkey_ex->cmd; tkey_ex++; } rc = pedit_skb_hdr_offset(skb, htype, &hoffset); if (rc) { pr_info_ratelimited("tc action pedit unable to extract header offset for header type (0x%x)\n", htype); goto bad; } if (tkey->offmask) { u8 *d, _d; if (!offset_valid(skb, hoffset + tkey->at)) { pr_info("tc action pedit 'at' offset %d out of bounds\n", hoffset + tkey->at); goto bad; } d = skb_header_pointer(skb, hoffset + tkey->at, sizeof(_d), &_d); if (!d) goto bad; offset += (*d & tkey->offmask) >> tkey->shift; } if (offset % 4) { pr_info("tc action pedit offset must be on 32 bit boundaries\n"); goto bad; } if (!offset_valid(skb, hoffset + offset)) { pr_info("tc action pedit offset %d out of bounds\n", hoffset + offset); goto bad; } ptr = skb_header_pointer(skb, hoffset + offset, sizeof(hdata), &hdata); if (!ptr) goto bad; /* just do it, baby */ switch (cmd) { case TCA_PEDIT_KEY_EX_CMD_SET: val = tkey->val; break; case TCA_PEDIT_KEY_EX_CMD_ADD: val = (*ptr + tkey->val) & ~tkey->mask; break; default: pr_info("tc action pedit bad command (%d)\n", cmd); goto bad; } *ptr = ((*ptr & tkey->mask) ^ val); if (ptr == &hdata) skb_store_bits(skb, hoffset + offset, ptr, 4); } goto done; bad: spin_lock(&p->tcf_lock); p->tcf_qstats.overlimits++; spin_unlock(&p->tcf_lock); done: return p->tcf_action; } static void tcf_pedit_stats_update(struct tc_action *a, u64 bytes, u64 packets, u64 drops, u64 lastuse, bool hw) { struct tcf_pedit *d = to_pedit(a); struct tcf_t *tm = &d->tcf_tm; tcf_action_update_stats(a, bytes, packets, drops, hw); tm->lastuse = max_t(u64, tm->lastuse, lastuse); } static int tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); struct tcf_pedit *p = to_pedit(a); struct tcf_pedit_parms *parms; struct tc_pedit *opt; struct tcf_t t; int s; spin_lock_bh(&p->tcf_lock); parms = rcu_dereference_protected(p->parms, 1); s = 
struct_size(opt, keys, parms->tcfp_nkeys); opt = kzalloc(s, GFP_ATOMIC); if (unlikely(!opt)) { spin_unlock_bh(&p->tcf_lock); return -ENOBUFS; } memcpy(opt->keys, parms->tcfp_keys, flex_array_size(opt, keys, parms->tcfp_nkeys)); opt->index = p->tcf_index; opt->nkeys = parms->tcfp_nkeys; opt->flags = parms->tcfp_flags; opt->action = p->tcf_action; opt->refcnt = refcount_read(&p->tcf_refcnt) - ref; opt->bindcnt = atomic_read(&p->tcf_bindcnt) - bind; if (parms->tcfp_keys_ex) { if (tcf_pedit_key_ex_dump(skb, parms->tcfp_keys_ex, parms->tcfp_nkeys)) goto nla_put_failure; if (nla_put(skb, TCA_PEDIT_PARMS_EX, s, opt)) goto nla_put_failure; } else { if (nla_put(skb, TCA_PEDIT_PARMS, s, opt)) goto nla_put_failure; } tcf_tm_dump(&t, &p->tcf_tm); if (nla_put_64bit(skb, TCA_PEDIT_TM, sizeof(t), &t, TCA_PEDIT_PAD)) goto nla_put_failure; spin_unlock_bh(&p->tcf_lock); kfree(opt); return skb->len; nla_put_failure: spin_unlock_bh(&p->tcf_lock); nlmsg_trim(skb, b); kfree(opt); return -1; } static int tcf_pedit_walker(struct net *net, struct sk_buff *skb, struct netlink_callback *cb, int type, const struct tc_action_ops *ops, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, pedit_net_id); return tcf_generic_walker(tn, skb, cb, type, ops, extack); } static int tcf_pedit_search(struct net *net, struct tc_action **a, u32 index) { struct tc_action_net *tn = net_generic(net, pedit_net_id); return tcf_idr_search(tn, a, index); } static struct tc_action_ops act_pedit_ops = { .kind = "pedit", .id = TCA_ID_PEDIT, .owner = THIS_MODULE, .act = tcf_pedit_act, .stats_update = tcf_pedit_stats_update, .dump = tcf_pedit_dump, .cleanup = tcf_pedit_cleanup, .init = tcf_pedit_init, .walk = tcf_pedit_walker, .lookup = tcf_pedit_search, .size = sizeof(struct tcf_pedit), }; static __net_init int pedit_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, pedit_net_id); return tc_action_net_init(net, tn, &act_pedit_ops); } static void __net_exit pedit_exit_net(struct list_head *net_list) { tc_action_net_exit(net_list, pedit_net_id); } static struct pernet_operations pedit_net_ops = { .init = pedit_init_net, .exit_batch = pedit_exit_net, .id = &pedit_net_id, .size = sizeof(struct tc_action_net), }; MODULE_AUTHOR("Jamal Hadi Salim(2002-4)"); MODULE_DESCRIPTION("Generic Packet Editor actions"); MODULE_LICENSE("GPL"); static int __init pedit_init_module(void) { return tcf_register_action(&act_pedit_ops, &pedit_net_ops); } static void __exit pedit_cleanup_module(void) { tcf_unregister_action(&act_pedit_ops, &pedit_net_ops); } module_init(pedit_init_module); module_exit(pedit_cleanup_module);
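The per-key arithmetic in tcf_pedit_act() above (off, at, offmask, shift, mask, val) is easiest to follow in isolation. Below is a minimal user-space sketch, assuming a struct that mirrors only the tc_pedit_key fields actually referenced in this file; the packet buffer and sample key values are illustrative, and header-offset lookup (pedit_skb_hdr_offset) is deliberately left out.

/* Sketch of a single SET key: an optional "at" byte (masked by offmask
 * and shifted) is added to the base offset, the offset must be 4-byte
 * aligned and in bounds, and the 32-bit word is rewritten as
 * (old & mask) ^ val -- the TCA_PEDIT_KEY_EX_CMD_SET case above.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct pedit_key {			/* mirrors the fields used above */
	uint32_t mask, val, off, at, offmask, shift;
};

static int apply_set_key(uint8_t *pkt, size_t len, const struct pedit_key *k)
{
	uint32_t off = k->off, word;

	if (k->offmask) {		/* variable part read from the packet */
		if (k->at >= len)
			return -1;
		off += (pkt[k->at] & k->offmask) >> k->shift;
	}
	if (off % 4 || off + 4 > len)	/* same alignment/bounds checks */
		return -1;

	memcpy(&word, pkt + off, 4);
	word = (word & k->mask) ^ k->val;
	memcpy(pkt + off, &word, 4);
	return 0;
}

int main(void)
{
	uint8_t pkt[32] = { 0 };
	uint32_t out;
	/* mask = 0 clears the word at offset 8, then writes val into it */
	struct pedit_key k = { .mask = 0, .val = 0xdeadbeef, .off = 8 };

	if (!apply_set_key(pkt, sizeof(pkt), &k)) {
		memcpy(&out, pkt + 8, 4);
		printf("word at offset 8 = 0x%08x\n", out);
	}
	return 0;
}

In the kernel path the base offset is additionally shifted by the header offset chosen from the extended key's htype, and mask/val arrive in network byte order from the netlink request; the sketch ignores both for brevity.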
// SPDX-License-Identifier: GPL-2.0
/*
 * cfg80211 scan result handling
 *
 * Copyright 2008	Johannes Berg <johannes@sipsolutions.net>
 * Copyright 2013-2014  Intel Mobile Communications GmbH
 * Copyright 2016	Intel Deutschland GmbH
 * Copyright (C) 2018-2021 Intel Corporation
 */
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/netdevice.h>
#include <linux/wireless.h>
#include <linux/nl80211.h>
#include <linux/etherdevice.h>
#include <linux/crc32.h>
#include <linux/bitfield.h>
#include <net/arp.h>
#include <net/cfg80211.h>
#include <net/cfg80211-wext.h>
#include <net/iw_handler.h>
#include "core.h"
#include "nl80211.h"
#include "wext-compat.h"
#include "rdev-ops.h"

/**
 * DOC: BSS tree/list structure
 *
 * At the top level, the BSS list is kept in both a list in each
 * registered device (@bss_list) as well as an RB-tree for faster
 * lookup. In the RB-tree, entries can be looked up using their
 * channel, MESHID, MESHCONF (for MBSSes) or channel, BSSID, SSID
 * for other BSSes.
 *
 * Due to the possibility of hidden SSIDs, there's a second level
 * structure, the "hidden_list" and "hidden_beacon_bss" pointer.
 * The hidden_list connects all BSSes belonging to a single AP
 * that has a hidden SSID, and connects beacon and probe response
 * entries. For a probe response entry for a hidden SSID, the
 * hidden_beacon_bss pointer points to the BSS struct holding the
 * beacon's information.
 *
 * Reference counting is done for all these references except for
 * the hidden_list, so that a beacon BSS struct that is otherwise
 * not referenced has one reference for being on the bss_list and
 * one for each probe response entry that points to it using the
 * hidden_beacon_bss pointer. When a BSS struct that has such a
 * pointer is get/put, the refcount update is also propagated to
 * the referenced struct; this ensures that it cannot get removed
 * while somebody is using the probe response version.
 *
 * Note that the hidden_beacon_bss pointer never changes, due to
 * the reference counting.
Therefore, no locking is needed for * it. * * Also note that the hidden_beacon_bss pointer is only relevant * if the driver uses something other than the IEs, e.g. private * data stored in the BSS struct, since the beacon IEs are * also linked into the probe response struct. */ /* * Limit the number of BSS entries stored in mac80211. Each one is * a bit over 4k at most, so this limits to roughly 4-5M of memory. * If somebody wants to really attack this though, they'd likely * use small beacons, and only one type of frame, limiting each of * the entries to a much smaller size (in order to generate more * entries in total, so overhead is bigger.) */ static int bss_entries_limit = 1000; module_param(bss_entries_limit, int, 0644); MODULE_PARM_DESC(bss_entries_limit, "limit to number of scan BSS entries (per wiphy, default 1000)"); #define IEEE80211_SCAN_RESULT_EXPIRE (30 * HZ) /** * struct cfg80211_colocated_ap - colocated AP information * * @list: linked list to all colocated aPS * @bssid: BSSID of the reported AP * @ssid: SSID of the reported AP * @ssid_len: length of the ssid * @center_freq: frequency the reported AP is on * @unsolicited_probe: the reported AP is part of an ESS, where all the APs * that operate in the same channel as the reported AP and that might be * detected by a STA receiving this frame, are transmitting unsolicited * Probe Response frames every 20 TUs * @oct_recommended: OCT is recommended to exchange MMPDUs with the reported AP * @same_ssid: the reported AP has the same SSID as the reporting AP * @multi_bss: the reported AP is part of a multiple BSSID set * @transmitted_bssid: the reported AP is the transmitting BSSID * @colocated_ess: all the APs that share the same ESS as the reported AP are * colocated and can be discovered via legacy bands. 
* @short_ssid_valid: short_ssid is valid and can be used * @short_ssid: the short SSID for this SSID */ struct cfg80211_colocated_ap { struct list_head list; u8 bssid[ETH_ALEN]; u8 ssid[IEEE80211_MAX_SSID_LEN]; size_t ssid_len; u32 short_ssid; u32 center_freq; u8 unsolicited_probe:1, oct_recommended:1, same_ssid:1, multi_bss:1, transmitted_bssid:1, colocated_ess:1, short_ssid_valid:1; }; static void bss_free(struct cfg80211_internal_bss *bss) { struct cfg80211_bss_ies *ies; if (WARN_ON(atomic_read(&bss->hold))) return; ies = (void *)rcu_access_pointer(bss->pub.beacon_ies); if (ies && !bss->pub.hidden_beacon_bss) kfree_rcu(ies, rcu_head); ies = (void *)rcu_access_pointer(bss->pub.proberesp_ies); if (ies) kfree_rcu(ies, rcu_head); /* * This happens when the module is removed, it doesn't * really matter any more save for completeness */ if (!list_empty(&bss->hidden_list)) list_del(&bss->hidden_list); kfree(bss); } static inline void bss_ref_get(struct cfg80211_registered_device *rdev, struct cfg80211_internal_bss *bss) { lockdep_assert_held(&rdev->bss_lock); bss->refcount++; if (bss->pub.hidden_beacon_bss) bss_from_pub(bss->pub.hidden_beacon_bss)->refcount++; if (bss->pub.transmitted_bss) bss_from_pub(bss->pub.transmitted_bss)->refcount++; } static inline void bss_ref_put(struct cfg80211_registered_device *rdev, struct cfg80211_internal_bss *bss) { lockdep_assert_held(&rdev->bss_lock); if (bss->pub.hidden_beacon_bss) { struct cfg80211_internal_bss *hbss; hbss = container_of(bss->pub.hidden_beacon_bss, struct cfg80211_internal_bss, pub); hbss->refcount--; if (hbss->refcount == 0) bss_free(hbss); } if (bss->pub.transmitted_bss) { struct cfg80211_internal_bss *tbss; tbss = container_of(bss->pub.transmitted_bss, struct cfg80211_internal_bss, pub); tbss->refcount--; if (tbss->refcount == 0) bss_free(tbss); } bss->refcount--; if (bss->refcount == 0) bss_free(bss); } static bool __cfg80211_unlink_bss(struct cfg80211_registered_device *rdev, struct cfg80211_internal_bss *bss) { lockdep_assert_held(&rdev->bss_lock); if (!list_empty(&bss->hidden_list)) { /* * don't remove the beacon entry if it has * probe responses associated with it */ if (!bss->pub.hidden_beacon_bss) return false; /* * if it's a probe response entry break its * link to the other entries in the group */ list_del_init(&bss->hidden_list); } list_del_init(&bss->list); list_del_init(&bss->pub.nontrans_list); rb_erase(&bss->rbn, &rdev->bss_tree); rdev->bss_entries--; WARN_ONCE((rdev->bss_entries == 0) ^ list_empty(&rdev->bss_list), "rdev bss entries[%d]/list[empty:%d] corruption\n", rdev->bss_entries, list_empty(&rdev->bss_list)); bss_ref_put(rdev, bss); return true; } bool cfg80211_is_element_inherited(const struct element *elem, const struct element *non_inherit_elem) { u8 id_len, ext_id_len, i, loop_len, id; const u8 *list; if (elem->id == WLAN_EID_MULTIPLE_BSSID) return false; if (!non_inherit_elem || non_inherit_elem->datalen < 2) return true; /* * non inheritance element format is: * ext ID (56) | IDs list len | list | extension IDs list len | list * Both lists are optional. Both lengths are mandatory. 
* This means valid length is: * elem_len = 1 (extension ID) + 2 (list len fields) + list lengths */ id_len = non_inherit_elem->data[1]; if (non_inherit_elem->datalen < 3 + id_len) return true; ext_id_len = non_inherit_elem->data[2 + id_len]; if (non_inherit_elem->datalen < 3 + id_len + ext_id_len) return true; if (elem->id == WLAN_EID_EXTENSION) { if (!ext_id_len) return true; loop_len = ext_id_len; list = &non_inherit_elem->data[3 + id_len]; id = elem->data[0]; } else { if (!id_len) return true; loop_len = id_len; list = &non_inherit_elem->data[2]; id = elem->id; } for (i = 0; i < loop_len; i++) { if (list[i] == id) return false; } return true; } EXPORT_SYMBOL(cfg80211_is_element_inherited); static size_t cfg80211_copy_elem_with_frags(const struct element *elem, const u8 *ie, size_t ie_len, u8 **pos, u8 *buf, size_t buf_len) { if (WARN_ON((u8 *)elem < ie || elem->data > ie + ie_len || elem->data + elem->datalen > ie + ie_len)) return 0; if (elem->datalen + 2 > buf + buf_len - *pos) return 0; memcpy(*pos, elem, elem->datalen + 2); *pos += elem->datalen + 2; /* Finish if it is not fragmented */ if (elem->datalen != 255) return *pos - buf; ie_len = ie + ie_len - elem->data - elem->datalen; ie = (const u8 *)elem->data + elem->datalen; for_each_element(elem, ie, ie_len) { if (elem->id != WLAN_EID_FRAGMENT) break; if (elem->datalen + 2 > buf + buf_len - *pos) return 0; memcpy(*pos, elem, elem->datalen + 2); *pos += elem->datalen + 2; if (elem->datalen != 255) break; } return *pos - buf; } static size_t cfg80211_gen_new_ie(const u8 *ie, size_t ielen, const u8 *subie, size_t subie_len, u8 *new_ie, size_t new_ie_len) { const struct element *non_inherit_elem, *parent, *sub; u8 *pos = new_ie; u8 id, ext_id; unsigned int match_len; non_inherit_elem = cfg80211_find_ext_elem(WLAN_EID_EXT_NON_INHERITANCE, subie, subie_len); /* We copy the elements one by one from the parent to the generated * elements. * If they are not inherited (included in subie or in the non * inheritance element), then we copy all occurrences the first time * we see this element type. */ for_each_element(parent, ie, ielen) { if (parent->id == WLAN_EID_FRAGMENT) continue; if (parent->id == WLAN_EID_EXTENSION) { if (parent->datalen < 1) continue; id = WLAN_EID_EXTENSION; ext_id = parent->data[0]; match_len = 1; } else { id = parent->id; match_len = 0; } /* Find first occurrence in subie */ sub = cfg80211_find_elem_match(id, subie, subie_len, &ext_id, match_len, 0); /* Copy from parent if not in subie and inherited */ if (!sub && cfg80211_is_element_inherited(parent, non_inherit_elem)) { if (!cfg80211_copy_elem_with_frags(parent, ie, ielen, &pos, new_ie, new_ie_len)) return 0; continue; } /* Already copied if an earlier element had the same type */ if (cfg80211_find_elem_match(id, ie, (u8 *)parent - ie, &ext_id, match_len, 0)) continue; /* Not inheriting, copy all similar elements from subie */ while (sub) { if (!cfg80211_copy_elem_with_frags(sub, subie, subie_len, &pos, new_ie, new_ie_len)) return 0; sub = cfg80211_find_elem_match(id, sub->data + sub->datalen, subie_len + subie - (sub->data + sub->datalen), &ext_id, match_len, 0); } } /* The above misses elements that are included in subie but not in the * parent, so do a pass over subie and append those. * Skip the non-tx BSSID caps and non-inheritance element. 
*/ for_each_element(sub, subie, subie_len) { if (sub->id == WLAN_EID_NON_TX_BSSID_CAP) continue; if (sub->id == WLAN_EID_FRAGMENT) continue; if (sub->id == WLAN_EID_EXTENSION) { if (sub->datalen < 1) continue; id = WLAN_EID_EXTENSION; ext_id = sub->data[0]; match_len = 1; if (ext_id == WLAN_EID_EXT_NON_INHERITANCE) continue; } else { id = sub->id; match_len = 0; } /* Processed if one was included in the parent */ if (cfg80211_find_elem_match(id, ie, ielen, &ext_id, match_len, 0)) continue; if (!cfg80211_copy_elem_with_frags(sub, subie, subie_len, &pos, new_ie, new_ie_len)) return 0; } return pos - new_ie; } static bool is_bss(struct cfg80211_bss *a, const u8 *bssid, const u8 *ssid, size_t ssid_len) { const struct cfg80211_bss_ies *ies; const u8 *ssidie; if (bssid && !ether_addr_equal(a->bssid, bssid)) return false; if (!ssid) return true; ies = rcu_access_pointer(a->ies); if (!ies) return false; ssidie = cfg80211_find_ie(WLAN_EID_SSID, ies->data, ies->len); if (!ssidie) return false; if (ssidie[1] != ssid_len) return false; return memcmp(ssidie + 2, ssid, ssid_len) == 0; } static int cfg80211_add_nontrans_list(struct cfg80211_bss *trans_bss, struct cfg80211_bss *nontrans_bss) { const u8 *ssid; size_t ssid_len; struct cfg80211_bss *bss = NULL; rcu_read_lock(); ssid = ieee80211_bss_get_ie(nontrans_bss, WLAN_EID_SSID); if (!ssid) { rcu_read_unlock(); return -EINVAL; } ssid_len = ssid[1]; ssid = ssid + 2; /* check if nontrans_bss is in the list */ list_for_each_entry(bss, &trans_bss->nontrans_list, nontrans_list) { if (is_bss(bss, nontrans_bss->bssid, ssid, ssid_len)) { rcu_read_unlock(); return 0; } } rcu_read_unlock(); /* * This is a bit weird - it's not on the list, but already on another * one! The only way that could happen is if there's some BSSID/SSID * shared by multiple APs in their multi-BSSID profiles, potentially * with hidden SSID mixed in ... ignore it. */ if (!list_empty(&nontrans_bss->nontrans_list)) return -EINVAL; /* add to the list */ list_add_tail(&nontrans_bss->nontrans_list, &trans_bss->nontrans_list); return 0; } static void __cfg80211_bss_expire(struct cfg80211_registered_device *rdev, unsigned long expire_time) { struct cfg80211_internal_bss *bss, *tmp; bool expired = false; lockdep_assert_held(&rdev->bss_lock); list_for_each_entry_safe(bss, tmp, &rdev->bss_list, list) { if (atomic_read(&bss->hold)) continue; if (!time_after(expire_time, bss->ts)) continue; if (__cfg80211_unlink_bss(rdev, bss)) expired = true; } if (expired) rdev->bss_generation++; } static bool cfg80211_bss_expire_oldest(struct cfg80211_registered_device *rdev) { struct cfg80211_internal_bss *bss, *oldest = NULL; bool ret; lockdep_assert_held(&rdev->bss_lock); list_for_each_entry(bss, &rdev->bss_list, list) { if (atomic_read(&bss->hold)) continue; if (!list_empty(&bss->hidden_list) && !bss->pub.hidden_beacon_bss) continue; if (oldest && time_before(oldest->ts, bss->ts)) continue; oldest = bss; } if (WARN_ON(!oldest)) return false; /* * The callers make sure to increase rdev->bss_generation if anything * gets removed (and a new entry added), so there's no need to also do * it here. 
*/ ret = __cfg80211_unlink_bss(rdev, oldest); WARN_ON(!ret); return ret; } static u8 cfg80211_parse_bss_param(u8 data, struct cfg80211_colocated_ap *coloc_ap) { coloc_ap->oct_recommended = u8_get_bits(data, IEEE80211_RNR_TBTT_PARAMS_OCT_RECOMMENDED); coloc_ap->same_ssid = u8_get_bits(data, IEEE80211_RNR_TBTT_PARAMS_SAME_SSID); coloc_ap->multi_bss = u8_get_bits(data, IEEE80211_RNR_TBTT_PARAMS_MULTI_BSSID); coloc_ap->transmitted_bssid = u8_get_bits(data, IEEE80211_RNR_TBTT_PARAMS_TRANSMITTED_BSSID); coloc_ap->unsolicited_probe = u8_get_bits(data, IEEE80211_RNR_TBTT_PARAMS_PROBE_ACTIVE); coloc_ap->colocated_ess = u8_get_bits(data, IEEE80211_RNR_TBTT_PARAMS_COLOC_ESS); return u8_get_bits(data, IEEE80211_RNR_TBTT_PARAMS_COLOC_AP); } static int cfg80211_calc_short_ssid(const struct cfg80211_bss_ies *ies, const struct element **elem, u32 *s_ssid) { *elem = cfg80211_find_elem(WLAN_EID_SSID, ies->data, ies->len); if (!*elem || (*elem)->datalen > IEEE80211_MAX_SSID_LEN) return -EINVAL; *s_ssid = ~crc32_le(~0, (*elem)->data, (*elem)->datalen); return 0; } static void cfg80211_free_coloc_ap_list(struct list_head *coloc_ap_list) { struct cfg80211_colocated_ap *ap, *tmp_ap; list_for_each_entry_safe(ap, tmp_ap, coloc_ap_list, list) { list_del(&ap->list); kfree(ap); } } static int cfg80211_parse_ap_info(struct cfg80211_colocated_ap *entry, const u8 *pos, u8 length, const struct element *ssid_elem, int s_ssid_tmp) { /* skip the TBTT offset */ pos++; memcpy(entry->bssid, pos, ETH_ALEN); pos += ETH_ALEN; if (length == IEEE80211_TBTT_INFO_OFFSET_BSSID_SSSID_BSS_PARAM) { memcpy(&entry->short_ssid, pos, sizeof(entry->short_ssid)); entry->short_ssid_valid = true; pos += 4; } /* skip non colocated APs */ if (!cfg80211_parse_bss_param(*pos, entry)) return -EINVAL; pos++; if (length == IEEE80211_TBTT_INFO_OFFSET_BSSID_BSS_PARAM) { /* * no information about the short ssid. Consider the entry valid * for now. It would later be dropped in case there are explicit * SSIDs that need to be matched */ if (!entry->same_ssid) return 0; } if (entry->same_ssid) { entry->short_ssid = s_ssid_tmp; entry->short_ssid_valid = true; /* * This is safe because we validate datalen in * cfg80211_parse_colocated_ap(), before calling this * function. */ memcpy(&entry->ssid, &ssid_elem->data, ssid_elem->datalen); entry->ssid_len = ssid_elem->datalen; } return 0; } static int cfg80211_parse_colocated_ap(const struct cfg80211_bss_ies *ies, struct list_head *list) { struct ieee80211_neighbor_ap_info *ap_info; const struct element *elem, *ssid_elem; const u8 *pos, *end; u32 s_ssid_tmp; int n_coloc = 0, ret; LIST_HEAD(ap_list); elem = cfg80211_find_elem(WLAN_EID_REDUCED_NEIGHBOR_REPORT, ies->data, ies->len); if (!elem) return 0; pos = elem->data; end = pos + elem->datalen; ret = cfg80211_calc_short_ssid(ies, &ssid_elem, &s_ssid_tmp); if (ret) return 0; /* RNR IE may contain more than one NEIGHBOR_AP_INFO */ while (pos + sizeof(*ap_info) <= end) { enum nl80211_band band; int freq; u8 length, i, count; ap_info = (void *)pos; count = u8_get_bits(ap_info->tbtt_info_hdr, IEEE80211_AP_INFO_TBTT_HDR_COUNT) + 1; length = ap_info->tbtt_info_len; pos += sizeof(*ap_info); if (!ieee80211_operating_class_to_band(ap_info->op_class, &band)) break; freq = ieee80211_channel_to_frequency(ap_info->channel, band); if (end - pos < count * length) break; /* * TBTT info must include bss param + BSSID + * (short SSID or same_ssid bit to be set). 
* ignore other options, and move to the * next AP info */ if (band != NL80211_BAND_6GHZ || (length != IEEE80211_TBTT_INFO_OFFSET_BSSID_BSS_PARAM && length < IEEE80211_TBTT_INFO_OFFSET_BSSID_SSSID_BSS_PARAM)) { pos += count * length; continue; } for (i = 0; i < count; i++) { struct cfg80211_colocated_ap *entry; entry = kzalloc(sizeof(*entry) + IEEE80211_MAX_SSID_LEN, GFP_ATOMIC); if (!entry) break; entry->center_freq = freq; if (!cfg80211_parse_ap_info(entry, pos, length, ssid_elem, s_ssid_tmp)) { n_coloc++; list_add_tail(&entry->list, &ap_list); } else { kfree(entry); } pos += length; } } if (pos != end) { cfg80211_free_coloc_ap_list(&ap_list); return 0; } list_splice_tail(&ap_list, list); return n_coloc; } static void cfg80211_scan_req_add_chan(struct cfg80211_scan_request *request, struct ieee80211_channel *chan, bool add_to_6ghz) { int i; u32 n_channels = request->n_channels; struct cfg80211_scan_6ghz_params *params = &request->scan_6ghz_params[request->n_6ghz_params]; for (i = 0; i < n_channels; i++) { if (request->channels[i] == chan) { if (add_to_6ghz) params->channel_idx = i; return; } } request->channels[n_channels] = chan; if (add_to_6ghz) request->scan_6ghz_params[request->n_6ghz_params].channel_idx = n_channels; request->n_channels++; } static bool cfg80211_find_ssid_match(struct cfg80211_colocated_ap *ap, struct cfg80211_scan_request *request) { int i; u32 s_ssid; for (i = 0; i < request->n_ssids; i++) { /* wildcard ssid in the scan request */ if (!request->ssids[i].ssid_len) { if (ap->multi_bss && !ap->transmitted_bssid) continue; return true; } if (ap->ssid_len && ap->ssid_len == request->ssids[i].ssid_len) { if (!memcmp(request->ssids[i].ssid, ap->ssid, ap->ssid_len)) return true; } else if (ap->short_ssid_valid) { s_ssid = ~crc32_le(~0, request->ssids[i].ssid, request->ssids[i].ssid_len); if (ap->short_ssid == s_ssid) return true; } } return false; } static int cfg80211_scan_6ghz(struct cfg80211_registered_device *rdev) { u8 i; struct cfg80211_colocated_ap *ap; int n_channels, count = 0, err; struct cfg80211_scan_request *request, *rdev_req = rdev->scan_req; LIST_HEAD(coloc_ap_list); bool need_scan_psc = true; const struct ieee80211_sband_iftype_data *iftd; rdev_req->scan_6ghz = true; if (!rdev->wiphy.bands[NL80211_BAND_6GHZ]) return -EOPNOTSUPP; iftd = ieee80211_get_sband_iftype_data(rdev->wiphy.bands[NL80211_BAND_6GHZ], rdev_req->wdev->iftype); if (!iftd || !iftd->he_cap.has_he) return -EOPNOTSUPP; n_channels = rdev->wiphy.bands[NL80211_BAND_6GHZ]->n_channels; if (rdev_req->flags & NL80211_SCAN_FLAG_COLOCATED_6GHZ) { struct cfg80211_internal_bss *intbss; spin_lock_bh(&rdev->bss_lock); list_for_each_entry(intbss, &rdev->bss_list, list) { struct cfg80211_bss *res = &intbss->pub; const struct cfg80211_bss_ies *ies; const struct element *ssid_elem; struct cfg80211_colocated_ap *entry; u32 s_ssid_tmp; int ret; ies = rcu_access_pointer(res->ies); count += cfg80211_parse_colocated_ap(ies, &coloc_ap_list); /* In case the scan request specified a specific BSSID * and the BSS is found and operating on 6GHz band then * add this AP to the collocated APs list. * This is relevant for ML probe requests when the lower * band APs have not been discovered. 
*/ if (is_broadcast_ether_addr(rdev_req->bssid) || !ether_addr_equal(rdev_req->bssid, res->bssid) || res->channel->band != NL80211_BAND_6GHZ) continue; ret = cfg80211_calc_short_ssid(ies, &ssid_elem, &s_ssid_tmp); if (ret) continue; entry = kzalloc(sizeof(*entry), GFP_ATOMIC); if (!entry) continue; memcpy(entry->bssid, res->bssid, ETH_ALEN); entry->short_ssid = s_ssid_tmp; memcpy(entry->ssid, ssid_elem->data, ssid_elem->datalen); entry->ssid_len = ssid_elem->datalen; entry->short_ssid_valid = true; entry->center_freq = res->channel->center_freq; list_add_tail(&entry->list, &coloc_ap_list); count++; } spin_unlock_bh(&rdev->bss_lock); } request = kzalloc(struct_size(request, channels, n_channels) + sizeof(*request->scan_6ghz_params) * count + sizeof(*request->ssids) * rdev_req->n_ssids, GFP_KERNEL); if (!request) { cfg80211_free_coloc_ap_list(&coloc_ap_list); return -ENOMEM; } *request = *rdev_req; request->n_channels = 0; request->scan_6ghz_params = (void *)&request->channels[n_channels]; /* * PSC channels should not be scanned in case of direct scan with 1 SSID * and at least one of the reported co-located APs with same SSID * indicating that all APs in the same ESS are co-located */ if (count && request->n_ssids == 1 && request->ssids[0].ssid_len) { list_for_each_entry(ap, &coloc_ap_list, list) { if (ap->colocated_ess && cfg80211_find_ssid_match(ap, request)) { need_scan_psc = false; break; } } } /* * add to the scan request the channels that need to be scanned * regardless of the collocated APs (PSC channels or all channels * in case that NL80211_SCAN_FLAG_COLOCATED_6GHZ is not set) */ for (i = 0; i < rdev_req->n_channels; i++) { if (rdev_req->channels[i]->band == NL80211_BAND_6GHZ && ((need_scan_psc && cfg80211_channel_is_psc(rdev_req->channels[i])) || !(rdev_req->flags & NL80211_SCAN_FLAG_COLOCATED_6GHZ))) { cfg80211_scan_req_add_chan(request, rdev_req->channels[i], false); } } if (!(rdev_req->flags & NL80211_SCAN_FLAG_COLOCATED_6GHZ)) goto skip; list_for_each_entry(ap, &coloc_ap_list, list) { bool found = false; struct cfg80211_scan_6ghz_params *scan_6ghz_params = &request->scan_6ghz_params[request->n_6ghz_params]; struct ieee80211_channel *chan = ieee80211_get_channel(&rdev->wiphy, ap->center_freq); if (!chan || chan->flags & IEEE80211_CHAN_DISABLED) continue; for (i = 0; i < rdev_req->n_channels; i++) { if (rdev_req->channels[i] == chan) found = true; } if (!found) continue; if (request->n_ssids > 0 && !cfg80211_find_ssid_match(ap, request)) continue; if (!is_broadcast_ether_addr(request->bssid) && !ether_addr_equal(request->bssid, ap->bssid)) continue; if (!request->n_ssids && ap->multi_bss && !ap->transmitted_bssid) continue; cfg80211_scan_req_add_chan(request, chan, true); memcpy(scan_6ghz_params->bssid, ap->bssid, ETH_ALEN); scan_6ghz_params->short_ssid = ap->short_ssid; scan_6ghz_params->short_ssid_valid = ap->short_ssid_valid; scan_6ghz_params->unsolicited_probe = ap->unsolicited_probe; /* * If a PSC channel is added to the scan and 'need_scan_psc' is * set to false, then all the APs that the scan logic is * interested with on the channel are collocated and thus there * is no need to perform the initial PSC channel listen. 
*/ if (cfg80211_channel_is_psc(chan) && !need_scan_psc) scan_6ghz_params->psc_no_listen = true; request->n_6ghz_params++; } skip: cfg80211_free_coloc_ap_list(&coloc_ap_list); if (request->n_channels) { struct cfg80211_scan_request *old = rdev->int_scan_req; rdev->int_scan_req = request; /* * Add the ssids from the parent scan request to the new scan * request, so the driver would be able to use them in its * probe requests to discover hidden APs on PSC channels. */ request->ssids = (void *)&request->channels[request->n_channels]; request->n_ssids = rdev_req->n_ssids; memcpy(request->ssids, rdev_req->ssids, sizeof(*request->ssids) * request->n_ssids); /* * If this scan follows a previous scan, save the scan start * info from the first part of the scan */ if (old) rdev->int_scan_req->info = old->info; err = rdev_scan(rdev, request); if (err) { rdev->int_scan_req = old; kfree(request); } else { kfree(old); } return err; } kfree(request); return -EINVAL; } int cfg80211_scan(struct cfg80211_registered_device *rdev) { struct cfg80211_scan_request *request; struct cfg80211_scan_request *rdev_req = rdev->scan_req; u32 n_channels = 0, idx, i; if (!(rdev->wiphy.flags & WIPHY_FLAG_SPLIT_SCAN_6GHZ)) return rdev_scan(rdev, rdev_req); for (i = 0; i < rdev_req->n_channels; i++) { if (rdev_req->channels[i]->band != NL80211_BAND_6GHZ) n_channels++; } if (!n_channels) return cfg80211_scan_6ghz(rdev); request = kzalloc(struct_size(request, channels, n_channels), GFP_KERNEL); if (!request) return -ENOMEM; *request = *rdev_req; request->n_channels = n_channels; for (i = idx = 0; i < rdev_req->n_channels; i++) { if (rdev_req->channels[i]->band != NL80211_BAND_6GHZ) request->channels[idx++] = rdev_req->channels[i]; } rdev_req->scan_6ghz = false; rdev->int_scan_req = request; return rdev_scan(rdev, request); } void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev, bool send_message) { struct cfg80211_scan_request *request, *rdev_req; struct wireless_dev *wdev; struct sk_buff *msg; #ifdef CONFIG_CFG80211_WEXT union iwreq_data wrqu; #endif lockdep_assert_held(&rdev->wiphy.mtx); if (rdev->scan_msg) { nl80211_send_scan_msg(rdev, rdev->scan_msg); rdev->scan_msg = NULL; return; } rdev_req = rdev->scan_req; if (!rdev_req) return; wdev = rdev_req->wdev; request = rdev->int_scan_req ? rdev->int_scan_req : rdev_req; if (wdev_running(wdev) && (rdev->wiphy.flags & WIPHY_FLAG_SPLIT_SCAN_6GHZ) && !rdev_req->scan_6ghz && !request->info.aborted && !cfg80211_scan_6ghz(rdev)) return; /* * This must be before sending the other events! * Otherwise, wpa_supplicant gets completely confused with * wext events. 
*/ if (wdev->netdev) cfg80211_sme_scan_done(wdev->netdev); if (!request->info.aborted && request->flags & NL80211_SCAN_FLAG_FLUSH) { /* flush entries from previous scans */ spin_lock_bh(&rdev->bss_lock); __cfg80211_bss_expire(rdev, request->scan_start); spin_unlock_bh(&rdev->bss_lock); } msg = nl80211_build_scan_msg(rdev, wdev, request->info.aborted); #ifdef CONFIG_CFG80211_WEXT if (wdev->netdev && !request->info.aborted) { memset(&wrqu, 0, sizeof(wrqu)); wireless_send_event(wdev->netdev, SIOCGIWSCAN, &wrqu, NULL); } #endif dev_put(wdev->netdev); kfree(rdev->int_scan_req); rdev->int_scan_req = NULL; kfree(rdev->scan_req); rdev->scan_req = NULL; if (!send_message) rdev->scan_msg = msg; else nl80211_send_scan_msg(rdev, msg); } void __cfg80211_scan_done(struct work_struct *wk) { struct cfg80211_registered_device *rdev; rdev = container_of(wk, struct cfg80211_registered_device, scan_done_wk); wiphy_lock(&rdev->wiphy); ___cfg80211_scan_done(rdev, true); wiphy_unlock(&rdev->wiphy); } void cfg80211_scan_done(struct cfg80211_scan_request *request, struct cfg80211_scan_info *info) { struct cfg80211_scan_info old_info = request->info; trace_cfg80211_scan_done(request, info); WARN_ON(request != wiphy_to_rdev(request->wiphy)->scan_req && request != wiphy_to_rdev(request->wiphy)->int_scan_req); request->info = *info; /* * In case the scan is split, the scan_start_tsf and tsf_bssid should * be of the first part. In such a case old_info.scan_start_tsf should * be non zero. */ if (request->scan_6ghz && old_info.scan_start_tsf) { request->info.scan_start_tsf = old_info.scan_start_tsf; memcpy(request->info.tsf_bssid, old_info.tsf_bssid, sizeof(request->info.tsf_bssid)); } request->notified = true; queue_work(cfg80211_wq, &wiphy_to_rdev(request->wiphy)->scan_done_wk); } EXPORT_SYMBOL(cfg80211_scan_done); void cfg80211_add_sched_scan_req(struct cfg80211_registered_device *rdev, struct cfg80211_sched_scan_request *req) { lockdep_assert_held(&rdev->wiphy.mtx); list_add_rcu(&req->list, &rdev->sched_scan_req_list); } static void cfg80211_del_sched_scan_req(struct cfg80211_registered_device *rdev, struct cfg80211_sched_scan_request *req) { lockdep_assert_held(&rdev->wiphy.mtx); list_del_rcu(&req->list); kfree_rcu(req, rcu_head); } static struct cfg80211_sched_scan_request * cfg80211_find_sched_scan_req(struct cfg80211_registered_device *rdev, u64 reqid) { struct cfg80211_sched_scan_request *pos; list_for_each_entry_rcu(pos, &rdev->sched_scan_req_list, list, lockdep_is_held(&rdev->wiphy.mtx)) { if (pos->reqid == reqid) return pos; } return NULL; } /* * Determines if a scheduled scan request can be handled. When a legacy * scheduled scan is running no other scheduled scan is allowed regardless * whether the request is for legacy or multi-support scan. When a multi-support * scheduled scan is running a request for legacy scan is not allowed. In this * case a request for multi-support scan can be handled if resources are * available, ie. struct wiphy::max_sched_scan_reqs limit is not yet reached. 
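 * For example, with struct wiphy::max_sched_scan_reqs == 2 and one
 * multi-support scan already running, a second multi-support request
 * is accepted, a third returns -ENOSPC, and a legacy request returns
 * -EINPROGRESS.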
*/ int cfg80211_sched_scan_req_possible(struct cfg80211_registered_device *rdev, bool want_multi) { struct cfg80211_sched_scan_request *pos; int i = 0; list_for_each_entry(pos, &rdev->sched_scan_req_list, list) { /* request id zero means legacy in progress */ if (!i && !pos->reqid) return -EINPROGRESS; i++; } if (i) { /* no legacy allowed when multi request(s) are active */ if (!want_multi) return -EINPROGRESS; /* resource limit reached */ if (i == rdev->wiphy.max_sched_scan_reqs) return -ENOSPC; } return 0; } void cfg80211_sched_scan_results_wk(struct work_struct *work) { struct cfg80211_registered_device *rdev; struct cfg80211_sched_scan_request *req, *tmp; rdev = container_of(work, struct cfg80211_registered_device, sched_scan_res_wk); wiphy_lock(&rdev->wiphy); list_for_each_entry_safe(req, tmp, &rdev->sched_scan_req_list, list) { if (req->report_results) { req->report_results = false; if (req->flags & NL80211_SCAN_FLAG_FLUSH) { /* flush entries from previous scans */ spin_lock_bh(&rdev->bss_lock); __cfg80211_bss_expire(rdev, req->scan_start); spin_unlock_bh(&rdev->bss_lock); req->scan_start = jiffies; } nl80211_send_sched_scan(req, NL80211_CMD_SCHED_SCAN_RESULTS); } } wiphy_unlock(&rdev->wiphy); } void cfg80211_sched_scan_results(struct wiphy *wiphy, u64 reqid) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); struct cfg80211_sched_scan_request *request; trace_cfg80211_sched_scan_results(wiphy, reqid); /* ignore if we're not scanning */ rcu_read_lock(); request = cfg80211_find_sched_scan_req(rdev, reqid); if (request) { request->report_results = true; queue_work(cfg80211_wq, &rdev->sched_scan_res_wk); } rcu_read_unlock(); } EXPORT_SYMBOL(cfg80211_sched_scan_results); void cfg80211_sched_scan_stopped_locked(struct wiphy *wiphy, u64 reqid) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); lockdep_assert_held(&wiphy->mtx); trace_cfg80211_sched_scan_stopped(wiphy, reqid); __cfg80211_stop_sched_scan(rdev, reqid, true); } EXPORT_SYMBOL(cfg80211_sched_scan_stopped_locked); void cfg80211_sched_scan_stopped(struct wiphy *wiphy, u64 reqid) { wiphy_lock(wiphy); cfg80211_sched_scan_stopped_locked(wiphy, reqid); wiphy_unlock(wiphy); } EXPORT_SYMBOL(cfg80211_sched_scan_stopped); int cfg80211_stop_sched_scan_req(struct cfg80211_registered_device *rdev, struct cfg80211_sched_scan_request *req, bool driver_initiated) { lockdep_assert_held(&rdev->wiphy.mtx); if (!driver_initiated) { int err = rdev_sched_scan_stop(rdev, req->dev, req->reqid); if (err) return err; } nl80211_send_sched_scan(req, NL80211_CMD_SCHED_SCAN_STOPPED); cfg80211_del_sched_scan_req(rdev, req); return 0; } int __cfg80211_stop_sched_scan(struct cfg80211_registered_device *rdev, u64 reqid, bool driver_initiated) { struct cfg80211_sched_scan_request *sched_scan_req; lockdep_assert_held(&rdev->wiphy.mtx); sched_scan_req = cfg80211_find_sched_scan_req(rdev, reqid); if (!sched_scan_req) return -ENOENT; return cfg80211_stop_sched_scan_req(rdev, sched_scan_req, driver_initiated); } void cfg80211_bss_age(struct cfg80211_registered_device *rdev, unsigned long age_secs) { struct cfg80211_internal_bss *bss; unsigned long age_jiffies = msecs_to_jiffies(age_secs * MSEC_PER_SEC); spin_lock_bh(&rdev->bss_lock); list_for_each_entry(bss, &rdev->bss_list, list) bss->ts -= age_jiffies; spin_unlock_bh(&rdev->bss_lock); } void cfg80211_bss_expire(struct cfg80211_registered_device *rdev) { __cfg80211_bss_expire(rdev, jiffies - IEEE80211_SCAN_RESULT_EXPIRE); } void cfg80211_bss_flush(struct wiphy *wiphy) { struct 
cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); spin_lock_bh(&rdev->bss_lock); __cfg80211_bss_expire(rdev, jiffies); spin_unlock_bh(&rdev->bss_lock); } EXPORT_SYMBOL(cfg80211_bss_flush); const struct element * cfg80211_find_elem_match(u8 eid, const u8 *ies, unsigned int len, const u8 *match, unsigned int match_len, unsigned int match_offset) { const struct element *elem; for_each_element_id(elem, eid, ies, len) { if (elem->datalen >= match_offset + match_len && !memcmp(elem->data + match_offset, match, match_len)) return elem; } return NULL; } EXPORT_SYMBOL(cfg80211_find_elem_match); const struct element *cfg80211_find_vendor_elem(unsigned int oui, int oui_type, const u8 *ies, unsigned int len) { const struct element *elem; u8 match[] = { oui >> 16, oui >> 8, oui, oui_type }; int match_len = (oui_type < 0) ? 3 : sizeof(match); if (WARN_ON(oui_type > 0xff)) return NULL; elem = cfg80211_find_elem_match(WLAN_EID_VENDOR_SPECIFIC, ies, len, match, match_len, 0); if (!elem || elem->datalen < 4) return NULL; return elem; } EXPORT_SYMBOL(cfg80211_find_vendor_elem); /** * enum bss_compare_mode - BSS compare mode * @BSS_CMP_REGULAR: regular compare mode (for insertion and normal find) * @BSS_CMP_HIDE_ZLEN: find hidden SSID with zero-length mode * @BSS_CMP_HIDE_NUL: find hidden SSID with NUL-ed out mode */ enum bss_compare_mode { BSS_CMP_REGULAR, BSS_CMP_HIDE_ZLEN, BSS_CMP_HIDE_NUL, }; static int cmp_bss(struct cfg80211_bss *a, struct cfg80211_bss *b, enum bss_compare_mode mode) { const struct cfg80211_bss_ies *a_ies, *b_ies; const u8 *ie1 = NULL; const u8 *ie2 = NULL; int i, r; if (a->channel != b->channel) return b->channel->center_freq - a->channel->center_freq; a_ies = rcu_access_pointer(a->ies); if (!a_ies) return -1; b_ies = rcu_access_pointer(b->ies); if (!b_ies) return 1; if (WLAN_CAPABILITY_IS_STA_BSS(a->capability)) ie1 = cfg80211_find_ie(WLAN_EID_MESH_ID, a_ies->data, a_ies->len); if (WLAN_CAPABILITY_IS_STA_BSS(b->capability)) ie2 = cfg80211_find_ie(WLAN_EID_MESH_ID, b_ies->data, b_ies->len); if (ie1 && ie2) { int mesh_id_cmp; if (ie1[1] == ie2[1]) mesh_id_cmp = memcmp(ie1 + 2, ie2 + 2, ie1[1]); else mesh_id_cmp = ie2[1] - ie1[1]; ie1 = cfg80211_find_ie(WLAN_EID_MESH_CONFIG, a_ies->data, a_ies->len); ie2 = cfg80211_find_ie(WLAN_EID_MESH_CONFIG, b_ies->data, b_ies->len); if (ie1 && ie2) { if (mesh_id_cmp) return mesh_id_cmp; if (ie1[1] != ie2[1]) return ie2[1] - ie1[1]; return memcmp(ie1 + 2, ie2 + 2, ie1[1]); } } r = memcmp(a->bssid, b->bssid, sizeof(a->bssid)); if (r) return r; ie1 = cfg80211_find_ie(WLAN_EID_SSID, a_ies->data, a_ies->len); ie2 = cfg80211_find_ie(WLAN_EID_SSID, b_ies->data, b_ies->len); if (!ie1 && !ie2) return 0; /* * Note that with "hide_ssid", the function returns a match if * the already-present BSS ("b") is a hidden SSID beacon for * the new BSS ("a"). */ /* sort missing IE before (left of) present IE */ if (!ie1) return -1; if (!ie2) return 1; switch (mode) { case BSS_CMP_HIDE_ZLEN: /* * In ZLEN mode we assume the BSS entry we're * looking for has a zero-length SSID. So if * the one we're looking at right now has that, * return 0. Otherwise, return the difference * in length, but since we're looking for the * 0-length it's really equivalent to returning * the length of the one we're looking at. * * No content comparison is needed as we assume * the content length is zero. 
*/ return ie2[1]; case BSS_CMP_REGULAR: default: /* sort by length first, then by contents */ if (ie1[1] != ie2[1]) return ie2[1] - ie1[1]; return memcmp(ie1 + 2, ie2 + 2, ie1[1]); case BSS_CMP_HIDE_NUL: if (ie1[1] != ie2[1]) return ie2[1] - ie1[1]; /* this is equivalent to memcmp(zeroes, ie2 + 2, len) */ for (i = 0; i < ie2[1]; i++) if (ie2[i + 2]) return -1; return 0; } } static bool cfg80211_bss_type_match(u16 capability, enum nl80211_band band, enum ieee80211_bss_type bss_type) { bool ret = true; u16 mask, val; if (bss_type == IEEE80211_BSS_TYPE_ANY) return ret; if (band == NL80211_BAND_60GHZ) { mask = WLAN_CAPABILITY_DMG_TYPE_MASK; switch (bss_type) { case IEEE80211_BSS_TYPE_ESS: val = WLAN_CAPABILITY_DMG_TYPE_AP; break; case IEEE80211_BSS_TYPE_PBSS: val = WLAN_CAPABILITY_DMG_TYPE_PBSS; break; case IEEE80211_BSS_TYPE_IBSS: val = WLAN_CAPABILITY_DMG_TYPE_IBSS; break; default: return false; } } else { mask = WLAN_CAPABILITY_ESS | WLAN_CAPABILITY_IBSS; switch (bss_type) { case IEEE80211_BSS_TYPE_ESS: val = WLAN_CAPABILITY_ESS; break; case IEEE80211_BSS_TYPE_IBSS: val = WLAN_CAPABILITY_IBSS; break; case IEEE80211_BSS_TYPE_MBSS: val = 0; break; default: return false; } } ret = ((capability & mask) == val); return ret; } /* Returned bss is reference counted and must be cleaned up appropriately. */ struct cfg80211_bss *cfg80211_get_bss(struct wiphy *wiphy, struct ieee80211_channel *channel, const u8 *bssid, const u8 *ssid, size_t ssid_len, enum ieee80211_bss_type bss_type, enum ieee80211_privacy privacy) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); struct cfg80211_internal_bss *bss, *res = NULL; unsigned long now = jiffies; int bss_privacy; trace_cfg80211_get_bss(wiphy, channel, bssid, ssid, ssid_len, bss_type, privacy); spin_lock_bh(&rdev->bss_lock); list_for_each_entry(bss, &rdev->bss_list, list) { if (!cfg80211_bss_type_match(bss->pub.capability, bss->pub.channel->band, bss_type)) continue; bss_privacy = (bss->pub.capability & WLAN_CAPABILITY_PRIVACY); if ((privacy == IEEE80211_PRIVACY_ON && !bss_privacy) || (privacy == IEEE80211_PRIVACY_OFF && bss_privacy)) continue; if (channel && bss->pub.channel != channel) continue; if (!is_valid_ether_addr(bss->pub.bssid)) continue; /* Don't get expired BSS structs */ if (time_after(now, bss->ts + IEEE80211_SCAN_RESULT_EXPIRE) && !atomic_read(&bss->hold)) continue; if (is_bss(&bss->pub, bssid, ssid, ssid_len)) { res = bss; bss_ref_get(rdev, res); break; } } spin_unlock_bh(&rdev->bss_lock); if (!res) return NULL; trace_cfg80211_return_bss(&res->pub); return &res->pub; } EXPORT_SYMBOL(cfg80211_get_bss); static bool rb_insert_bss(struct cfg80211_registered_device *rdev, struct cfg80211_internal_bss *bss) { struct rb_node **p = &rdev->bss_tree.rb_node; struct rb_node *parent = NULL; struct cfg80211_internal_bss *tbss; int cmp; while (*p) { parent = *p; tbss = rb_entry(parent, struct cfg80211_internal_bss, rbn); cmp = cmp_bss(&bss->pub, &tbss->pub, BSS_CMP_REGULAR); if (WARN_ON(!cmp)) { /* will sort of leak this BSS */ return false; } if (cmp < 0) p = &(*p)->rb_left; else p = &(*p)->rb_right; } rb_link_node(&bss->rbn, parent, p); rb_insert_color(&bss->rbn, &rdev->bss_tree); return true; } static struct cfg80211_internal_bss * rb_find_bss(struct cfg80211_registered_device *rdev, struct cfg80211_internal_bss *res, enum bss_compare_mode mode) { struct rb_node *n = rdev->bss_tree.rb_node; struct cfg80211_internal_bss *bss; int r; while (n) { bss = rb_entry(n, struct cfg80211_internal_bss, rbn); r = cmp_bss(&res->pub, &bss->pub, mode); if 
(r == 0) return bss; else if (r < 0) n = n->rb_left; else n = n->rb_right; } return NULL; } static void cfg80211_insert_bss(struct cfg80211_registered_device *rdev, struct cfg80211_internal_bss *bss) { lockdep_assert_held(&rdev->bss_lock); if (!rb_insert_bss(rdev, bss)) return; list_add_tail(&bss->list, &rdev->bss_list); rdev->bss_entries++; } static void cfg80211_rehash_bss(struct cfg80211_registered_device *rdev, struct cfg80211_internal_bss *bss) { lockdep_assert_held(&rdev->bss_lock); rb_erase(&bss->rbn, &rdev->bss_tree); if (!rb_insert_bss(rdev, bss)) { list_del(&bss->list); if (!list_empty(&bss->hidden_list)) list_del_init(&bss->hidden_list); if (!list_empty(&bss->pub.nontrans_list)) list_del_init(&bss->pub.nontrans_list); rdev->bss_entries--; } rdev->bss_generation++; } static bool cfg80211_combine_bsses(struct cfg80211_registered_device *rdev, struct cfg80211_internal_bss *new) { const struct cfg80211_bss_ies *ies; struct cfg80211_internal_bss *bss; const u8 *ie; int i, ssidlen; u8 fold = 0; u32 n_entries = 0; ies = rcu_access_pointer(new->pub.beacon_ies); if (WARN_ON(!ies)) return false; ie = cfg80211_find_ie(WLAN_EID_SSID, ies->data, ies->len); if (!ie) { /* nothing to do */ return true; } ssidlen = ie[1]; for (i = 0; i < ssidlen; i++) fold |= ie[2 + i]; if (fold) { /* not a hidden SSID */ return true; } /* This is the bad part ... */ list_for_each_entry(bss, &rdev->bss_list, list) { /* * we're iterating all the entries anyway, so take the * opportunity to validate the list length accounting */ n_entries++; if (!ether_addr_equal(bss->pub.bssid, new->pub.bssid)) continue; if (bss->pub.channel != new->pub.channel) continue; if (bss->pub.scan_width != new->pub.scan_width) continue; if (rcu_access_pointer(bss->pub.beacon_ies)) continue; ies = rcu_access_pointer(bss->pub.ies); if (!ies) continue; ie = cfg80211_find_ie(WLAN_EID_SSID, ies->data, ies->len); if (!ie) continue; if (ssidlen && ie[1] != ssidlen) continue; if (WARN_ON_ONCE(bss->pub.hidden_beacon_bss)) continue; if (WARN_ON_ONCE(!list_empty(&bss->hidden_list))) list_del(&bss->hidden_list); /* combine them */ list_add(&bss->hidden_list, &new->hidden_list); bss->pub.hidden_beacon_bss = &new->pub; new->refcount += bss->refcount; rcu_assign_pointer(bss->pub.beacon_ies, new->pub.beacon_ies); } WARN_ONCE(n_entries != rdev->bss_entries, "rdev bss entries[%d]/list[len:%d] corruption\n", rdev->bss_entries, n_entries); return true; } struct cfg80211_non_tx_bss { struct cfg80211_bss *tx_bss; u8 max_bssid_indicator; u8 bssid_index; }; static void cfg80211_update_hidden_bsses(struct cfg80211_internal_bss *known, const struct cfg80211_bss_ies *new_ies, const struct cfg80211_bss_ies *old_ies) { struct cfg80211_internal_bss *bss; /* Assign beacon IEs to all sub entries */ list_for_each_entry(bss, &known->hidden_list, hidden_list) { const struct cfg80211_bss_ies *ies; ies = rcu_access_pointer(bss->pub.beacon_ies); WARN_ON(ies != old_ies); rcu_assign_pointer(bss->pub.beacon_ies, new_ies); } } static bool cfg80211_update_known_bss(struct cfg80211_registered_device *rdev, struct cfg80211_internal_bss *known, struct cfg80211_internal_bss *new, bool signal_valid) { lockdep_assert_held(&rdev->bss_lock); /* Update IEs */ if (rcu_access_pointer(new->pub.proberesp_ies)) { const struct cfg80211_bss_ies *old; old = rcu_access_pointer(known->pub.proberesp_ies); rcu_assign_pointer(known->pub.proberesp_ies, new->pub.proberesp_ies); /* Override possible earlier Beacon frame IEs */ rcu_assign_pointer(known->pub.ies, new->pub.proberesp_ies); if (old) 
kfree_rcu((struct cfg80211_bss_ies *)old, rcu_head); } else if (rcu_access_pointer(new->pub.beacon_ies)) { const struct cfg80211_bss_ies *old; if (known->pub.hidden_beacon_bss && !list_empty(&known->hidden_list)) { const struct cfg80211_bss_ies *f; /* The known BSS struct is one of the probe * response members of a group, but we're * receiving a beacon (beacon_ies in the new * bss is used). This can only mean that the * AP changed its beacon from not having an * SSID to showing it, which is confusing so * drop this information. */ f = rcu_access_pointer(new->pub.beacon_ies); kfree_rcu((struct cfg80211_bss_ies *)f, rcu_head); return false; } old = rcu_access_pointer(known->pub.beacon_ies); rcu_assign_pointer(known->pub.beacon_ies, new->pub.beacon_ies); /* Override IEs if they were from a beacon before */ if (old == rcu_access_pointer(known->pub.ies)) rcu_assign_pointer(known->pub.ies, new->pub.beacon_ies); cfg80211_update_hidden_bsses(known, rcu_access_pointer(new->pub.beacon_ies), old); if (old) kfree_rcu((struct cfg80211_bss_ies *)old, rcu_head); } known->pub.beacon_interval = new->pub.beacon_interval; /* don't update the signal if beacon was heard on * adjacent channel. */ if (signal_valid) known->pub.signal = new->pub.signal; known->pub.capability = new->pub.capability; known->ts = new->ts; known->ts_boottime = new->ts_boottime; known->parent_tsf = new->parent_tsf; known->pub.chains = new->pub.chains; memcpy(known->pub.chain_signal, new->pub.chain_signal, IEEE80211_MAX_CHAINS); ether_addr_copy(known->parent_bssid, new->parent_bssid); known->pub.max_bssid_indicator = new->pub.max_bssid_indicator; known->pub.bssid_index = new->pub.bssid_index; return true; } /* Returned bss is reference counted and must be cleaned up appropriately. */ struct cfg80211_internal_bss * cfg80211_bss_update(struct cfg80211_registered_device *rdev, struct cfg80211_internal_bss *tmp, bool signal_valid, unsigned long ts) { struct cfg80211_internal_bss *found = NULL; if (WARN_ON(!tmp->pub.channel)) return NULL; tmp->ts = ts; spin_lock_bh(&rdev->bss_lock); if (WARN_ON(!rcu_access_pointer(tmp->pub.ies))) { spin_unlock_bh(&rdev->bss_lock); return NULL; } found = rb_find_bss(rdev, tmp, BSS_CMP_REGULAR); if (found) { if (!cfg80211_update_known_bss(rdev, found, tmp, signal_valid)) goto drop; } else { struct cfg80211_internal_bss *new; struct cfg80211_internal_bss *hidden; struct cfg80211_bss_ies *ies; /* * create a copy -- the "res" variable that is passed in * is allocated on the stack since it's not needed in the * more common case of an update */ new = kzalloc(sizeof(*new) + rdev->wiphy.bss_priv_size, GFP_ATOMIC); if (!new) { ies = (void *)rcu_dereference(tmp->pub.beacon_ies); if (ies) kfree_rcu(ies, rcu_head); ies = (void *)rcu_dereference(tmp->pub.proberesp_ies); if (ies) kfree_rcu(ies, rcu_head); goto drop; } memcpy(new, tmp, sizeof(*new)); new->refcount = 1; INIT_LIST_HEAD(&new->hidden_list); INIT_LIST_HEAD(&new->pub.nontrans_list); /* we'll set this later if it was non-NULL */ new->pub.transmitted_bss = NULL; if (rcu_access_pointer(tmp->pub.proberesp_ies)) { hidden = rb_find_bss(rdev, tmp, BSS_CMP_HIDE_ZLEN); if (!hidden) hidden = rb_find_bss(rdev, tmp, BSS_CMP_HIDE_NUL); if (hidden) { new->pub.hidden_beacon_bss = &hidden->pub; list_add(&new->hidden_list, &hidden->hidden_list); hidden->refcount++; ies = (void *)rcu_access_pointer(new->pub.beacon_ies); rcu_assign_pointer(new->pub.beacon_ies, hidden->pub.beacon_ies); if (ies) kfree_rcu(ies, rcu_head); } } else { /* * Ok so we found a beacon, and don't have an 
entry. If * it's a beacon with hidden SSID, we might be in for an * expensive search for any probe responses that should * be grouped with this beacon for updates ... */ if (!cfg80211_combine_bsses(rdev, new)) { bss_ref_put(rdev, new); goto drop; } } if (rdev->bss_entries >= bss_entries_limit && !cfg80211_bss_expire_oldest(rdev)) { bss_ref_put(rdev, new); goto drop; } /* This must be before the call to bss_ref_get */ if (tmp->pub.transmitted_bss) { struct cfg80211_internal_bss *pbss = container_of(tmp->pub.transmitted_bss, struct cfg80211_internal_bss, pub); new->pub.transmitted_bss = tmp->pub.transmitted_bss; bss_ref_get(rdev, pbss); } cfg80211_insert_bss(rdev, new); found = new; } rdev->bss_generation++; bss_ref_get(rdev, found); spin_unlock_bh(&rdev->bss_lock); return found; drop: spin_unlock_bh(&rdev->bss_lock); return NULL; } /* * Update RX channel information based on the available frame payload * information. This is mainly for the 2.4 GHz band where frames can be received * from neighboring channels and the Beacon frames use the DSSS Parameter Set * element to indicate the current (transmitting) channel, but this might also * be needed on other bands if RX frequency does not match with the actual * operating channel of a BSS. */ static struct ieee80211_channel * cfg80211_get_bss_channel(struct wiphy *wiphy, const u8 *ie, size_t ielen, struct ieee80211_channel *channel, enum nl80211_bss_scan_width scan_width) { const u8 *tmp; u32 freq; int channel_number = -1; struct ieee80211_channel *alt_channel; if (channel->band == NL80211_BAND_S1GHZ) { tmp = cfg80211_find_ie(WLAN_EID_S1G_OPERATION, ie, ielen); if (tmp && tmp[1] >= sizeof(struct ieee80211_s1g_oper_ie)) { struct ieee80211_s1g_oper_ie *s1gop = (void *)(tmp + 2); channel_number = s1gop->primary_ch; } } else { tmp = cfg80211_find_ie(WLAN_EID_DS_PARAMS, ie, ielen); if (tmp && tmp[1] == 1) { channel_number = tmp[2]; } else { tmp = cfg80211_find_ie(WLAN_EID_HT_OPERATION, ie, ielen); if (tmp && tmp[1] >= sizeof(struct ieee80211_ht_operation)) { struct ieee80211_ht_operation *htop = (void *)(tmp + 2); channel_number = htop->primary_chan; } } } if (channel_number < 0) { /* No channel information in frame payload */ return channel; } freq = ieee80211_channel_to_freq_khz(channel_number, channel->band); alt_channel = ieee80211_get_channel_khz(wiphy, freq); if (!alt_channel) { if (channel->band == NL80211_BAND_2GHZ) { /* * Better not allow unexpected channels when that could * be going beyond the 1-11 range (e.g., discovering * BSS on channel 12 when radio is configured for * channel 11. */ return NULL; } /* No match for the payload channel number - ignore it */ return channel; } if (scan_width == NL80211_BSS_CHAN_WIDTH_10 || scan_width == NL80211_BSS_CHAN_WIDTH_5) { /* * Ignore channel number in 5 and 10 MHz channels where there * may not be an n:1 or 1:n mapping between frequencies and * channel numbers. */ return channel; } /* * Use the channel determined through the payload channel number * instead of the RX channel reported by the driver. */ if (alt_channel->flags & IEEE80211_CHAN_DISABLED) return NULL; return alt_channel; } /* Returned bss is reference counted and must be cleaned up appropriately. 
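 * Callers must release the returned entry with cfg80211_put_bss()
 * once they are done with it.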
*/ static struct cfg80211_bss * cfg80211_inform_single_bss_data(struct wiphy *wiphy, struct cfg80211_inform_bss *data, enum cfg80211_bss_frame_type ftype, const u8 *bssid, u64 tsf, u16 capability, u16 beacon_interval, const u8 *ie, size_t ielen, struct cfg80211_non_tx_bss *non_tx_data, gfp_t gfp) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); struct cfg80211_bss_ies *ies; struct ieee80211_channel *channel; struct cfg80211_internal_bss tmp = {}, *res; int bss_type; bool signal_valid; unsigned long ts; if (WARN_ON(!wiphy)) return NULL; if (WARN_ON(wiphy->signal_type == CFG80211_SIGNAL_TYPE_UNSPEC && (data->signal < 0 || data->signal > 100))) return NULL; channel = cfg80211_get_bss_channel(wiphy, ie, ielen, data->chan, data->scan_width); if (!channel) return NULL; memcpy(tmp.pub.bssid, bssid, ETH_ALEN); tmp.pub.channel = channel; tmp.pub.scan_width = data->scan_width; tmp.pub.signal = data->signal; tmp.pub.beacon_interval = beacon_interval; tmp.pub.capability = capability; tmp.ts_boottime = data->boottime_ns; tmp.parent_tsf = data->parent_tsf; ether_addr_copy(tmp.parent_bssid, data->parent_bssid); if (non_tx_data) { tmp.pub.transmitted_bss = non_tx_data->tx_bss; ts = bss_from_pub(non_tx_data->tx_bss)->ts; tmp.pub.bssid_index = non_tx_data->bssid_index; tmp.pub.max_bssid_indicator = non_tx_data->max_bssid_indicator; } else { ts = jiffies; } /* * If we do not know here whether the IEs are from a Beacon or Probe * Response frame, we need to pick one of the options and only use it * with the driver that does not provide the full Beacon/Probe Response * frame. Use Beacon frame pointer to avoid indicating that this should * override the IEs pointer should we have received an earlier * indication of Probe Response data. */ ies = kzalloc(sizeof(*ies) + ielen, gfp); if (!ies) return NULL; ies->len = ielen; ies->tsf = tsf; ies->from_beacon = false; memcpy(ies->data, ie, ielen); switch (ftype) { case CFG80211_BSS_FTYPE_BEACON: ies->from_beacon = true; fallthrough; case CFG80211_BSS_FTYPE_UNKNOWN: rcu_assign_pointer(tmp.pub.beacon_ies, ies); break; case CFG80211_BSS_FTYPE_PRESP: rcu_assign_pointer(tmp.pub.proberesp_ies, ies); break; } rcu_assign_pointer(tmp.pub.ies, ies); signal_valid = data->chan == channel; res = cfg80211_bss_update(wiphy_to_rdev(wiphy), &tmp, signal_valid, ts); if (!res) return NULL; if (channel->band == NL80211_BAND_60GHZ) { bss_type = res->pub.capability & WLAN_CAPABILITY_DMG_TYPE_MASK; if (bss_type == WLAN_CAPABILITY_DMG_TYPE_AP || bss_type == WLAN_CAPABILITY_DMG_TYPE_PBSS) regulatory_hint_found_beacon(wiphy, channel, gfp); } else { if (res->pub.capability & WLAN_CAPABILITY_ESS) regulatory_hint_found_beacon(wiphy, channel, gfp); } if (non_tx_data) { /* this is a nontransmitting bss, we need to add it to * transmitting bss' list if it is not there */ spin_lock_bh(&rdev->bss_lock); if (cfg80211_add_nontrans_list(non_tx_data->tx_bss, &res->pub)) { if (__cfg80211_unlink_bss(rdev, res)) { rdev->bss_generation++; res = NULL; } } spin_unlock_bh(&rdev->bss_lock); if (!res) return NULL; } trace_cfg80211_return_bss(&res->pub); /* cfg80211_bss_update gives us a referenced result */ return &res->pub; } static const struct element *cfg80211_get_profile_continuation(const u8 *ie, size_t ielen, const struct element *mbssid_elem, const struct element *sub_elem) { const u8 *mbssid_end = mbssid_elem->data + mbssid_elem->datalen; const struct element *next_mbssid; const struct element *next_sub; next_mbssid = cfg80211_find_elem(WLAN_EID_MULTIPLE_BSSID, mbssid_end, ielen - 
(mbssid_end - ie)); /* * If it is not the last subelement in current MBSSID IE or there isn't * a next MBSSID IE - profile is complete. */ if ((sub_elem->data + sub_elem->datalen < mbssid_end - 1) || !next_mbssid) return NULL; /* For any length error, just return NULL */ if (next_mbssid->datalen < 4) return NULL; next_sub = (void *)&next_mbssid->data[1]; if (next_mbssid->data + next_mbssid->datalen < next_sub->data + next_sub->datalen) return NULL; if (next_sub->id != 0 || next_sub->datalen < 2) return NULL; /* * Check if the first element in the next sub element is a start * of a new profile */ return next_sub->data[0] == WLAN_EID_NON_TX_BSSID_CAP ? NULL : next_mbssid; } size_t cfg80211_merge_profile(const u8 *ie, size_t ielen, const struct element *mbssid_elem, const struct element *sub_elem, u8 *merged_ie, size_t max_copy_len) { size_t copied_len = sub_elem->datalen; const struct element *next_mbssid; if (sub_elem->datalen > max_copy_len) return 0; memcpy(merged_ie, sub_elem->data, sub_elem->datalen); while ((next_mbssid = cfg80211_get_profile_continuation(ie, ielen, mbssid_elem, sub_elem))) { const struct element *next_sub = (void *)&next_mbssid->data[1]; if (copied_len + next_sub->datalen > max_copy_len) break; memcpy(merged_ie + copied_len, next_sub->data, next_sub->datalen); copied_len += next_sub->datalen; } return copied_len; } EXPORT_SYMBOL(cfg80211_merge_profile); static void cfg80211_parse_mbssid_data(struct wiphy *wiphy, struct cfg80211_inform_bss *data, enum cfg80211_bss_frame_type ftype, const u8 *bssid, u64 tsf, u16 beacon_interval, const u8 *ie, size_t ielen, struct cfg80211_non_tx_bss *non_tx_data, gfp_t gfp) { const u8 *mbssid_index_ie; const struct element *elem, *sub; size_t new_ie_len; u8 new_bssid[ETH_ALEN]; u8 *new_ie, *profile; u64 seen_indices = 0; u16 capability; struct cfg80211_bss *bss; if (!non_tx_data) return; if (!cfg80211_find_ie(WLAN_EID_MULTIPLE_BSSID, ie, ielen)) return; if (!wiphy->support_mbssid) return; if (wiphy->support_only_he_mbssid && !cfg80211_find_ext_ie(WLAN_EID_EXT_HE_CAPABILITY, ie, ielen)) return; new_ie = kmalloc(IEEE80211_MAX_DATA_LEN, gfp); if (!new_ie) return; profile = kmalloc(ielen, gfp); if (!profile) goto out; for_each_element_id(elem, WLAN_EID_MULTIPLE_BSSID, ie, ielen) { if (elem->datalen < 4) continue; if (elem->data[0] < 1 || (int)elem->data[0] > 8) continue; for_each_element(sub, elem->data + 1, elem->datalen - 1) { u8 profile_len; if (sub->id != 0 || sub->datalen < 4) { /* not a valid BSS profile */ continue; } if (sub->data[0] != WLAN_EID_NON_TX_BSSID_CAP || sub->data[1] != 2) { /* The first element within the Nontransmitted * BSSID Profile is not the Nontransmitted * BSSID Capability element. 
*/ continue; } memset(profile, 0, ielen); profile_len = cfg80211_merge_profile(ie, ielen, elem, sub, profile, ielen); /* found a Nontransmitted BSSID Profile */ mbssid_index_ie = cfg80211_find_ie (WLAN_EID_MULTI_BSSID_IDX, profile, profile_len); if (!mbssid_index_ie || mbssid_index_ie[1] < 1 || mbssid_index_ie[2] == 0 || mbssid_index_ie[2] > 46) { /* No valid Multiple BSSID-Index element */ continue; } if (seen_indices & BIT_ULL(mbssid_index_ie[2])) /* We don't support legacy split of a profile */ net_dbg_ratelimited("Partial info for BSSID index %d\n", mbssid_index_ie[2]); seen_indices |= BIT_ULL(mbssid_index_ie[2]); non_tx_data->bssid_index = mbssid_index_ie[2]; non_tx_data->max_bssid_indicator = elem->data[0]; cfg80211_gen_new_bssid(bssid, non_tx_data->max_bssid_indicator, non_tx_data->bssid_index, new_bssid); memset(new_ie, 0, IEEE80211_MAX_DATA_LEN); new_ie_len = cfg80211_gen_new_ie(ie, ielen, profile, profile_len, new_ie, IEEE80211_MAX_DATA_LEN); if (!new_ie_len) continue; capability = get_unaligned_le16(profile + 2); bss = cfg80211_inform_single_bss_data(wiphy, data, ftype, new_bssid, tsf, capability, beacon_interval, new_ie, new_ie_len, non_tx_data, gfp); if (!bss) break; cfg80211_put_bss(wiphy, bss); } } out: kfree(new_ie); kfree(profile); } struct cfg80211_bss * cfg80211_inform_bss_data(struct wiphy *wiphy, struct cfg80211_inform_bss *data, enum cfg80211_bss_frame_type ftype, const u8 *bssid, u64 tsf, u16 capability, u16 beacon_interval, const u8 *ie, size_t ielen, gfp_t gfp) { struct cfg80211_bss *res; struct cfg80211_non_tx_bss non_tx_data; res = cfg80211_inform_single_bss_data(wiphy, data, ftype, bssid, tsf, capability, beacon_interval, ie, ielen, NULL, gfp); if (!res) return NULL; non_tx_data.tx_bss = res; cfg80211_parse_mbssid_data(wiphy, data, ftype, bssid, tsf, beacon_interval, ie, ielen, &non_tx_data, gfp); return res; } EXPORT_SYMBOL(cfg80211_inform_bss_data); static void cfg80211_parse_mbssid_frame_data(struct wiphy *wiphy, struct cfg80211_inform_bss *data, struct ieee80211_mgmt *mgmt, size_t len, struct cfg80211_non_tx_bss *non_tx_data, gfp_t gfp) { enum cfg80211_bss_frame_type ftype; const u8 *ie = mgmt->u.probe_resp.variable; size_t ielen = len - offsetof(struct ieee80211_mgmt, u.probe_resp.variable); ftype = ieee80211_is_beacon(mgmt->frame_control) ? CFG80211_BSS_FTYPE_BEACON : CFG80211_BSS_FTYPE_PRESP; cfg80211_parse_mbssid_data(wiphy, data, ftype, mgmt->bssid, le64_to_cpu(mgmt->u.probe_resp.timestamp), le16_to_cpu(mgmt->u.probe_resp.beacon_int), ie, ielen, non_tx_data, gfp); } static void cfg80211_update_notlisted_nontrans(struct wiphy *wiphy, struct cfg80211_bss *nontrans_bss, struct ieee80211_mgmt *mgmt, size_t len) { u8 *ie, *new_ie, *pos; const u8 *nontrans_ssid, *trans_ssid, *mbssid; size_t ielen = len - offsetof(struct ieee80211_mgmt, u.probe_resp.variable); size_t new_ie_len; struct cfg80211_bss_ies *new_ies; const struct cfg80211_bss_ies *old; size_t cpy_len; lockdep_assert_held(&wiphy_to_rdev(wiphy)->bss_lock); ie = mgmt->u.probe_resp.variable; new_ie_len = ielen; trans_ssid = cfg80211_find_ie(WLAN_EID_SSID, ie, ielen); if (!trans_ssid) return; new_ie_len -= trans_ssid[1]; mbssid = cfg80211_find_ie(WLAN_EID_MULTIPLE_BSSID, ie, ielen); /* * It's not valid to have the MBSSID element before SSID * ignore if that happens - the code below assumes it is * after (while copying things inbetween). 
*/ if (!mbssid || mbssid < trans_ssid) return; new_ie_len -= mbssid[1]; nontrans_ssid = ieee80211_bss_get_ie(nontrans_bss, WLAN_EID_SSID); if (!nontrans_ssid) return; new_ie_len += nontrans_ssid[1]; /* generate new ie for nontrans BSS * 1. replace SSID with nontrans BSS' SSID * 2. skip MBSSID IE */ new_ie = kzalloc(new_ie_len, GFP_ATOMIC); if (!new_ie) return; new_ies = kzalloc(sizeof(*new_ies) + new_ie_len, GFP_ATOMIC); if (!new_ies) goto out_free; pos = new_ie; /* copy the nontransmitted SSID */ cpy_len = nontrans_ssid[1] + 2; memcpy(pos, nontrans_ssid, cpy_len); pos += cpy_len; /* copy the IEs between SSID and MBSSID */ cpy_len = trans_ssid[1] + 2; memcpy(pos, (trans_ssid + cpy_len), (mbssid - (trans_ssid + cpy_len))); pos += (mbssid - (trans_ssid + cpy_len)); /* copy the IEs after MBSSID */ cpy_len = mbssid[1] + 2; memcpy(pos, mbssid + cpy_len, ((ie + ielen) - (mbssid + cpy_len))); /* update ie */ new_ies->len = new_ie_len; new_ies->tsf = le64_to_cpu(mgmt->u.probe_resp.timestamp); new_ies->from_beacon = ieee80211_is_beacon(mgmt->frame_control); memcpy(new_ies->data, new_ie, new_ie_len); if (ieee80211_is_probe_resp(mgmt->frame_control)) { old = rcu_access_pointer(nontrans_bss->proberesp_ies); rcu_assign_pointer(nontrans_bss->proberesp_ies, new_ies); rcu_assign_pointer(nontrans_bss->ies, new_ies); if (old) kfree_rcu((struct cfg80211_bss_ies *)old, rcu_head); } else { old = rcu_access_pointer(nontrans_bss->beacon_ies); rcu_assign_pointer(nontrans_bss->beacon_ies, new_ies); cfg80211_update_hidden_bsses(bss_from_pub(nontrans_bss), new_ies, old); rcu_assign_pointer(nontrans_bss->ies, new_ies); if (old) kfree_rcu((struct cfg80211_bss_ies *)old, rcu_head); } out_free: kfree(new_ie); } /* cfg80211_inform_bss_width_frame helper */ static struct cfg80211_bss * cfg80211_inform_single_bss_frame_data(struct wiphy *wiphy, struct cfg80211_inform_bss *data, struct ieee80211_mgmt *mgmt, size_t len, gfp_t gfp) { struct cfg80211_internal_bss tmp = {}, *res; struct cfg80211_bss_ies *ies; struct ieee80211_channel *channel; bool signal_valid; struct ieee80211_ext *ext = NULL; u8 *bssid, *variable; u16 capability, beacon_int; size_t ielen, min_hdr_len = offsetof(struct ieee80211_mgmt, u.probe_resp.variable); int bss_type; BUILD_BUG_ON(offsetof(struct ieee80211_mgmt, u.probe_resp.variable) != offsetof(struct ieee80211_mgmt, u.beacon.variable)); trace_cfg80211_inform_bss_frame(wiphy, data, mgmt, len); if (WARN_ON(!mgmt)) return NULL; if (WARN_ON(!wiphy)) return NULL; if (WARN_ON(wiphy->signal_type == CFG80211_SIGNAL_TYPE_UNSPEC && (data->signal < 0 || data->signal > 100))) return NULL; if (ieee80211_is_s1g_beacon(mgmt->frame_control)) { ext = (void *) mgmt; min_hdr_len = offsetof(struct ieee80211_ext, u.s1g_beacon); if (ieee80211_is_s1g_short_beacon(mgmt->frame_control)) min_hdr_len = offsetof(struct ieee80211_ext, u.s1g_short_beacon.variable); } if (WARN_ON(len < min_hdr_len)) return NULL; ielen = len - min_hdr_len; variable = mgmt->u.probe_resp.variable; if (ext) { if (ieee80211_is_s1g_short_beacon(mgmt->frame_control)) variable = ext->u.s1g_short_beacon.variable; else variable = ext->u.s1g_beacon.variable; } channel = cfg80211_get_bss_channel(wiphy, variable, ielen, data->chan, data->scan_width); if (!channel) return NULL; if (ext) { const struct ieee80211_s1g_bcn_compat_ie *compat; const struct element *elem; elem = cfg80211_find_elem(WLAN_EID_S1G_BCN_COMPAT, variable, ielen); if (!elem) return NULL; if (elem->datalen < sizeof(*compat)) return NULL; compat = (void *)elem->data; bssid = ext->u.s1g_beacon.sa; 
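	/*
	 * S1G beacon frames do not carry capability or beacon interval
	 * fields in the fixed part of the frame, so take both from the
	 * S1G Beacon Compatibility element located above; the frame's
	 * SA doubles as the BSSID.
	 */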
capability = le16_to_cpu(compat->compat_info); beacon_int = le16_to_cpu(compat->beacon_int); } else { bssid = mgmt->bssid; beacon_int = le16_to_cpu(mgmt->u.probe_resp.beacon_int); capability = le16_to_cpu(mgmt->u.probe_resp.capab_info); } ies = kzalloc(sizeof(*ies) + ielen, gfp); if (!ies) return NULL; ies->len = ielen; ies->tsf = le64_to_cpu(mgmt->u.probe_resp.timestamp); ies->from_beacon = ieee80211_is_beacon(mgmt->frame_control) || ieee80211_is_s1g_beacon(mgmt->frame_control); memcpy(ies->data, variable, ielen); if (ieee80211_is_probe_resp(mgmt->frame_control)) rcu_assign_pointer(tmp.pub.proberesp_ies, ies); else rcu_assign_pointer(tmp.pub.beacon_ies, ies); rcu_assign_pointer(tmp.pub.ies, ies); memcpy(tmp.pub.bssid, bssid, ETH_ALEN); tmp.pub.beacon_interval = beacon_int; tmp.pub.capability = capability; tmp.pub.channel = channel; tmp.pub.scan_width = data->scan_width; tmp.pub.signal = data->signal; tmp.ts_boottime = data->boottime_ns; tmp.parent_tsf = data->parent_tsf; tmp.pub.chains = data->chains; memcpy(tmp.pub.chain_signal, data->chain_signal, IEEE80211_MAX_CHAINS); ether_addr_copy(tmp.parent_bssid, data->parent_bssid); signal_valid = data->chan == channel; res = cfg80211_bss_update(wiphy_to_rdev(wiphy), &tmp, signal_valid, jiffies); if (!res) return NULL; if (channel->band == NL80211_BAND_60GHZ) { bss_type = res->pub.capability & WLAN_CAPABILITY_DMG_TYPE_MASK; if (bss_type == WLAN_CAPABILITY_DMG_TYPE_AP || bss_type == WLAN_CAPABILITY_DMG_TYPE_PBSS) regulatory_hint_found_beacon(wiphy, channel, gfp); } else { if (res->pub.capability & WLAN_CAPABILITY_ESS) regulatory_hint_found_beacon(wiphy, channel, gfp); } trace_cfg80211_return_bss(&res->pub); /* cfg80211_bss_update gives us a referenced result */ return &res->pub; } struct cfg80211_bss * cfg80211_inform_bss_frame_data(struct wiphy *wiphy, struct cfg80211_inform_bss *data, struct ieee80211_mgmt *mgmt, size_t len, gfp_t gfp) { struct cfg80211_bss *res, *tmp_bss; const u8 *ie = mgmt->u.probe_resp.variable; const struct cfg80211_bss_ies *ies1, *ies2; size_t ielen = len - offsetof(struct ieee80211_mgmt, u.probe_resp.variable); struct cfg80211_non_tx_bss non_tx_data = {}; res = cfg80211_inform_single_bss_frame_data(wiphy, data, mgmt, len, gfp); /* don't do any further MBSSID handling for S1G */ if (ieee80211_is_s1g_beacon(mgmt->frame_control)) return res; if (!res || !wiphy->support_mbssid || !cfg80211_find_ie(WLAN_EID_MULTIPLE_BSSID, ie, ielen)) return res; if (wiphy->support_only_he_mbssid && !cfg80211_find_ext_ie(WLAN_EID_EXT_HE_CAPABILITY, ie, ielen)) return res; non_tx_data.tx_bss = res; /* process each non-transmitting bss */ cfg80211_parse_mbssid_frame_data(wiphy, data, mgmt, len, &non_tx_data, gfp); spin_lock_bh(&wiphy_to_rdev(wiphy)->bss_lock); /* check if the res has other nontransmitting bss which is not * in MBSSID IE */ ies1 = rcu_access_pointer(res->ies); /* go through nontrans_list, if the timestamp of the BSS is * earlier than the timestamp of the transmitting BSS then * update it */ list_for_each_entry(tmp_bss, &res->nontrans_list, nontrans_list) { ies2 = rcu_access_pointer(tmp_bss->ies); if (ies2->tsf < ies1->tsf) cfg80211_update_notlisted_nontrans(wiphy, tmp_bss, mgmt, len); } spin_unlock_bh(&wiphy_to_rdev(wiphy)->bss_lock); return res; } EXPORT_SYMBOL(cfg80211_inform_bss_frame_data); void cfg80211_ref_bss(struct wiphy *wiphy, struct cfg80211_bss *pub) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); struct cfg80211_internal_bss *bss; if (!pub) return; bss = container_of(pub, struct 
cfg80211_internal_bss, pub); spin_lock_bh(&rdev->bss_lock); bss_ref_get(rdev, bss); spin_unlock_bh(&rdev->bss_lock); } EXPORT_SYMBOL(cfg80211_ref_bss); void cfg80211_put_bss(struct wiphy *wiphy, struct cfg80211_bss *pub) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); struct cfg80211_internal_bss *bss; if (!pub) return; bss = container_of(pub, struct cfg80211_internal_bss, pub); spin_lock_bh(&rdev->bss_lock); bss_ref_put(rdev, bss); spin_unlock_bh(&rdev->bss_lock); } EXPORT_SYMBOL(cfg80211_put_bss); void cfg80211_unlink_bss(struct wiphy *wiphy, struct cfg80211_bss *pub) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); struct cfg80211_internal_bss *bss, *tmp1; struct cfg80211_bss *nontrans_bss, *tmp; if (WARN_ON(!pub)) return; bss = container_of(pub, struct cfg80211_internal_bss, pub); spin_lock_bh(&rdev->bss_lock); if (list_empty(&bss->list)) goto out; list_for_each_entry_safe(nontrans_bss, tmp, &pub->nontrans_list, nontrans_list) { tmp1 = container_of(nontrans_bss, struct cfg80211_internal_bss, pub); if (__cfg80211_unlink_bss(rdev, tmp1)) rdev->bss_generation++; } if (__cfg80211_unlink_bss(rdev, bss)) rdev->bss_generation++; out: spin_unlock_bh(&rdev->bss_lock); } EXPORT_SYMBOL(cfg80211_unlink_bss); void cfg80211_bss_iter(struct wiphy *wiphy, struct cfg80211_chan_def *chandef, void (*iter)(struct wiphy *wiphy, struct cfg80211_bss *bss, void *data), void *iter_data) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); struct cfg80211_internal_bss *bss; spin_lock_bh(&rdev->bss_lock); list_for_each_entry(bss, &rdev->bss_list, list) { if (!chandef || cfg80211_is_sub_chan(chandef, bss->pub.channel)) iter(wiphy, &bss->pub, iter_data); } spin_unlock_bh(&rdev->bss_lock); } EXPORT_SYMBOL(cfg80211_bss_iter); void cfg80211_update_assoc_bss_entry(struct wireless_dev *wdev, struct ieee80211_channel *chan) { struct wiphy *wiphy = wdev->wiphy; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); struct cfg80211_internal_bss *cbss = wdev->current_bss; struct cfg80211_internal_bss *new = NULL; struct cfg80211_internal_bss *bss; struct cfg80211_bss *nontrans_bss; struct cfg80211_bss *tmp; spin_lock_bh(&rdev->bss_lock); /* * Some APs use CSA also for bandwidth changes, i.e., without actually * changing the control channel, so no need to update in such a case. 
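 * If only the bandwidth changed, cbss->pub.channel still matches
 * chan and the update is skipped right below.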
*/ if (cbss->pub.channel == chan) goto done; /* use transmitting bss */ if (cbss->pub.transmitted_bss) cbss = container_of(cbss->pub.transmitted_bss, struct cfg80211_internal_bss, pub); cbss->pub.channel = chan; list_for_each_entry(bss, &rdev->bss_list, list) { if (!cfg80211_bss_type_match(bss->pub.capability, bss->pub.channel->band, wdev->conn_bss_type)) continue; if (bss == cbss) continue; if (!cmp_bss(&bss->pub, &cbss->pub, BSS_CMP_REGULAR)) { new = bss; break; } } if (new) { /* to save time, update IEs for transmitting bss only */ if (cfg80211_update_known_bss(rdev, cbss, new, false)) { new->pub.proberesp_ies = NULL; new->pub.beacon_ies = NULL; } list_for_each_entry_safe(nontrans_bss, tmp, &new->pub.nontrans_list, nontrans_list) { bss = container_of(nontrans_bss, struct cfg80211_internal_bss, pub); if (__cfg80211_unlink_bss(rdev, bss)) rdev->bss_generation++; } WARN_ON(atomic_read(&new->hold)); if (!WARN_ON(!__cfg80211_unlink_bss(rdev, new))) rdev->bss_generation++; } cfg80211_rehash_bss(rdev, cbss); list_for_each_entry_safe(nontrans_bss, tmp, &cbss->pub.nontrans_list, nontrans_list) { bss = container_of(nontrans_bss, struct cfg80211_internal_bss, pub); bss->pub.channel = chan; cfg80211_rehash_bss(rdev, bss); } done: spin_unlock_bh(&rdev->bss_lock); } #ifdef CONFIG_CFG80211_WEXT static struct cfg80211_registered_device * cfg80211_get_dev_from_ifindex(struct net *net, int ifindex) { struct cfg80211_registered_device *rdev; struct net_device *dev; ASSERT_RTNL(); dev = dev_get_by_index(net, ifindex); if (!dev) return ERR_PTR(-ENODEV); if (dev->ieee80211_ptr) rdev = wiphy_to_rdev(dev->ieee80211_ptr->wiphy); else rdev = ERR_PTR(-ENODEV); dev_put(dev); return rdev; } int cfg80211_wext_siwscan(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra) { struct cfg80211_registered_device *rdev; struct wiphy *wiphy; struct iw_scan_req *wreq = NULL; struct cfg80211_scan_request *creq = NULL; int i, err, n_channels = 0; enum nl80211_band band; if (!netif_running(dev)) return -ENETDOWN; if (wrqu->data.length == sizeof(struct iw_scan_req)) wreq = (struct iw_scan_req *)extra; rdev = cfg80211_get_dev_from_ifindex(dev_net(dev), dev->ifindex); if (IS_ERR(rdev)) return PTR_ERR(rdev); if (rdev->scan_req || rdev->scan_msg) { err = -EBUSY; goto out; } wiphy = &rdev->wiphy; /* Determine number of channels, needed to allocate creq */ if (wreq && wreq->num_channels) { /* Passed from userspace so should be checked */ if (unlikely(wreq->num_channels > IW_MAX_FREQUENCIES)) return -EINVAL; n_channels = wreq->num_channels; } else { n_channels = ieee80211_get_num_supported_channels(wiphy); } creq = kzalloc(struct_size(creq, channels, n_channels) + sizeof(struct cfg80211_ssid), GFP_ATOMIC); if (!creq) { err = -ENOMEM; goto out; } creq->wiphy = wiphy; creq->wdev = dev->ieee80211_ptr; /* SSIDs come after channels */ creq->ssids = (void *)creq + struct_size(creq, channels, n_channels); creq->n_channels = n_channels; creq->n_ssids = 1; creq->scan_start = jiffies; /* translate "Scan on frequencies" request */ i = 0; for (band = 0; band < NUM_NL80211_BANDS; band++) { int j; if (!wiphy->bands[band]) continue; for (j = 0; j < wiphy->bands[band]->n_channels; j++) { /* ignore disabled channels */ if (wiphy->bands[band]->channels[j].flags & IEEE80211_CHAN_DISABLED) continue; /* If we have a wireless request structure and the * wireless request specifies frequencies, then search * for the matching hardware channel. 
*/ if (wreq && wreq->num_channels) { int k; int wiphy_freq = wiphy->bands[band]->channels[j].center_freq; for (k = 0; k < wreq->num_channels; k++) { struct iw_freq *freq = &wreq->channel_list[k]; int wext_freq = cfg80211_wext_freq(freq); if (wext_freq == wiphy_freq) goto wext_freq_found; } goto wext_freq_not_found; } wext_freq_found: creq->channels[i] = &wiphy->bands[band]->channels[j]; i++; wext_freq_not_found: ; } } /* No channels found? */ if (!i) { err = -EINVAL; goto out; } /* Set real number of channels specified in creq->channels[] */ creq->n_channels = i; /* translate "Scan for SSID" request */ if (wreq) { if (wrqu->data.flags & IW_SCAN_THIS_ESSID) { if (wreq->essid_len > IEEE80211_MAX_SSID_LEN) { err = -EINVAL; goto out; } memcpy(creq->ssids[0].ssid, wreq->essid, wreq->essid_len); creq->ssids[0].ssid_len = wreq->essid_len; } if (wreq->scan_type == IW_SCAN_TYPE_PASSIVE) creq->n_ssids = 0; } for (i = 0; i < NUM_NL80211_BANDS; i++) if (wiphy->bands[i]) creq->rates[i] = (1 << wiphy->bands[i]->n_bitrates) - 1; eth_broadcast_addr(creq->bssid); wiphy_lock(&rdev->wiphy); rdev->scan_req = creq; err = rdev_scan(rdev, creq); if (err) { rdev->scan_req = NULL; /* creq will be freed below */ } else { nl80211_send_scan_start(rdev, dev->ieee80211_ptr); /* creq now owned by driver */ creq = NULL; dev_hold(dev); } wiphy_unlock(&rdev->wiphy); out: kfree(creq); return err; } EXPORT_WEXT_HANDLER(cfg80211_wext_siwscan); static char *ieee80211_scan_add_ies(struct iw_request_info *info, const struct cfg80211_bss_ies *ies, char *current_ev, char *end_buf) { const u8 *pos, *end, *next; struct iw_event iwe; if (!ies) return current_ev; /* * If needed, fragment the IEs buffer (at IE boundaries) into short * enough fragments to fit into IW_GENERIC_IE_MAX octet messages. 
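 * Each fragment is then reported to userspace as its own IWEVGENIE
 * event via iwe_stream_add_point_check().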
*/ pos = ies->data; end = pos + ies->len; while (end - pos > IW_GENERIC_IE_MAX) { next = pos + 2 + pos[1]; while (next + 2 + next[1] - pos < IW_GENERIC_IE_MAX) next = next + 2 + next[1]; memset(&iwe, 0, sizeof(iwe)); iwe.cmd = IWEVGENIE; iwe.u.data.length = next - pos; current_ev = iwe_stream_add_point_check(info, current_ev, end_buf, &iwe, (void *)pos); if (IS_ERR(current_ev)) return current_ev; pos = next; } if (end > pos) { memset(&iwe, 0, sizeof(iwe)); iwe.cmd = IWEVGENIE; iwe.u.data.length = end - pos; current_ev = iwe_stream_add_point_check(info, current_ev, end_buf, &iwe, (void *)pos); if (IS_ERR(current_ev)) return current_ev; } return current_ev; } static char * ieee80211_bss(struct wiphy *wiphy, struct iw_request_info *info, struct cfg80211_internal_bss *bss, char *current_ev, char *end_buf) { const struct cfg80211_bss_ies *ies; struct iw_event iwe; const u8 *ie; u8 buf[50]; u8 *cfg, *p, *tmp; int rem, i, sig; bool ismesh = false; memset(&iwe, 0, sizeof(iwe)); iwe.cmd = SIOCGIWAP; iwe.u.ap_addr.sa_family = ARPHRD_ETHER; memcpy(iwe.u.ap_addr.sa_data, bss->pub.bssid, ETH_ALEN); current_ev = iwe_stream_add_event_check(info, current_ev, end_buf, &iwe, IW_EV_ADDR_LEN); if (IS_ERR(current_ev)) return current_ev; memset(&iwe, 0, sizeof(iwe)); iwe.cmd = SIOCGIWFREQ; iwe.u.freq.m = ieee80211_frequency_to_channel(bss->pub.channel->center_freq); iwe.u.freq.e = 0; current_ev = iwe_stream_add_event_check(info, current_ev, end_buf, &iwe, IW_EV_FREQ_LEN); if (IS_ERR(current_ev)) return current_ev; memset(&iwe, 0, sizeof(iwe)); iwe.cmd = SIOCGIWFREQ; iwe.u.freq.m = bss->pub.channel->center_freq; iwe.u.freq.e = 6; current_ev = iwe_stream_add_event_check(info, current_ev, end_buf, &iwe, IW_EV_FREQ_LEN); if (IS_ERR(current_ev)) return current_ev; if (wiphy->signal_type != CFG80211_SIGNAL_TYPE_NONE) { memset(&iwe, 0, sizeof(iwe)); iwe.cmd = IWEVQUAL; iwe.u.qual.updated = IW_QUAL_LEVEL_UPDATED | IW_QUAL_NOISE_INVALID | IW_QUAL_QUAL_UPDATED; switch (wiphy->signal_type) { case CFG80211_SIGNAL_TYPE_MBM: sig = bss->pub.signal / 100; iwe.u.qual.level = sig; iwe.u.qual.updated |= IW_QUAL_DBM; if (sig < -110) /* rather bad */ sig = -110; else if (sig > -40) /* perfect */ sig = -40; /* will give a range of 0 .. 70 */ iwe.u.qual.qual = sig + 110; break; case CFG80211_SIGNAL_TYPE_UNSPEC: iwe.u.qual.level = bss->pub.signal; /* will give range 0 .. 
100 */ iwe.u.qual.qual = bss->pub.signal; break; default: /* not reached */ break; } current_ev = iwe_stream_add_event_check(info, current_ev, end_buf, &iwe, IW_EV_QUAL_LEN); if (IS_ERR(current_ev)) return current_ev; } memset(&iwe, 0, sizeof(iwe)); iwe.cmd = SIOCGIWENCODE; if (bss->pub.capability & WLAN_CAPABILITY_PRIVACY) iwe.u.data.flags = IW_ENCODE_ENABLED | IW_ENCODE_NOKEY; else iwe.u.data.flags = IW_ENCODE_DISABLED; iwe.u.data.length = 0; current_ev = iwe_stream_add_point_check(info, current_ev, end_buf, &iwe, ""); if (IS_ERR(current_ev)) return current_ev; rcu_read_lock(); ies = rcu_dereference(bss->pub.ies); rem = ies->len; ie = ies->data; while (rem >= 2) { /* invalid data */ if (ie[1] > rem - 2) break; switch (ie[0]) { case WLAN_EID_SSID: memset(&iwe, 0, sizeof(iwe)); iwe.cmd = SIOCGIWESSID; iwe.u.data.length = ie[1]; iwe.u.data.flags = 1; current_ev = iwe_stream_add_point_check(info, current_ev, end_buf, &iwe, (u8 *)ie + 2); if (IS_ERR(current_ev)) goto unlock; break; case WLAN_EID_MESH_ID: memset(&iwe, 0, sizeof(iwe)); iwe.cmd = SIOCGIWESSID; iwe.u.data.length = ie[1]; iwe.u.data.flags = 1; current_ev = iwe_stream_add_point_check(info, current_ev, end_buf, &iwe, (u8 *)ie + 2); if (IS_ERR(current_ev)) goto unlock; break; case WLAN_EID_MESH_CONFIG: ismesh = true; if (ie[1] != sizeof(struct ieee80211_meshconf_ie)) break; cfg = (u8 *)ie + 2; memset(&iwe, 0, sizeof(iwe)); iwe.cmd = IWEVCUSTOM; sprintf(buf, "Mesh Network Path Selection Protocol ID: " "0x%02X", cfg[0]); iwe.u.data.length = strlen(buf); current_ev = iwe_stream_add_point_check(info, current_ev, end_buf, &iwe, buf); if (IS_ERR(current_ev)) goto unlock; sprintf(buf, "Path Selection Metric ID: 0x%02X", cfg[1]); iwe.u.data.length = strlen(buf); current_ev = iwe_stream_add_point_check(info, current_ev, end_buf, &iwe, buf); if (IS_ERR(current_ev)) goto unlock; sprintf(buf, "Congestion Control Mode ID: 0x%02X", cfg[2]); iwe.u.data.length = strlen(buf); current_ev = iwe_stream_add_point_check(info, current_ev, end_buf, &iwe, buf); if (IS_ERR(current_ev)) goto unlock; sprintf(buf, "Synchronization ID: 0x%02X", cfg[3]); iwe.u.data.length = strlen(buf); current_ev = iwe_stream_add_point_check(info, current_ev, end_buf, &iwe, buf); if (IS_ERR(current_ev)) goto unlock; sprintf(buf, "Authentication ID: 0x%02X", cfg[4]); iwe.u.data.length = strlen(buf); current_ev = iwe_stream_add_point_check(info, current_ev, end_buf, &iwe, buf); if (IS_ERR(current_ev)) goto unlock; sprintf(buf, "Formation Info: 0x%02X", cfg[5]); iwe.u.data.length = strlen(buf); current_ev = iwe_stream_add_point_check(info, current_ev, end_buf, &iwe, buf); if (IS_ERR(current_ev)) goto unlock; sprintf(buf, "Capabilities: 0x%02X", cfg[6]); iwe.u.data.length = strlen(buf); current_ev = iwe_stream_add_point_check(info, current_ev, end_buf, &iwe, buf); if (IS_ERR(current_ev)) goto unlock; break; case WLAN_EID_SUPP_RATES: case WLAN_EID_EXT_SUPP_RATES: /* display all supported rates in readable format */ p = current_ev + iwe_stream_lcp_len(info); memset(&iwe, 0, sizeof(iwe)); iwe.cmd = SIOCGIWRATE; /* Those two flags are ignored... 
*/ iwe.u.bitrate.fixed = iwe.u.bitrate.disabled = 0; for (i = 0; i < ie[1]; i++) { iwe.u.bitrate.value = ((ie[i + 2] & 0x7f) * 500000); tmp = p; p = iwe_stream_add_value(info, current_ev, p, end_buf, &iwe, IW_EV_PARAM_LEN); if (p == tmp) { current_ev = ERR_PTR(-E2BIG); goto unlock; } } current_ev = p; break; } rem -= ie[1] + 2; ie += ie[1] + 2; } if (bss->pub.capability & (WLAN_CAPABILITY_ESS | WLAN_CAPABILITY_IBSS) || ismesh) { memset(&iwe, 0, sizeof(iwe)); iwe.cmd = SIOCGIWMODE; if (ismesh) iwe.u.mode = IW_MODE_MESH; else if (bss->pub.capability & WLAN_CAPABILITY_ESS) iwe.u.mode = IW_MODE_MASTER; else iwe.u.mode = IW_MODE_ADHOC; current_ev = iwe_stream_add_event_check(info, current_ev, end_buf, &iwe, IW_EV_UINT_LEN); if (IS_ERR(current_ev)) goto unlock; } memset(&iwe, 0, sizeof(iwe)); iwe.cmd = IWEVCUSTOM; sprintf(buf, "tsf=%016llx", (unsigned long long)(ies->tsf)); iwe.u.data.length = strlen(buf); current_ev = iwe_stream_add_point_check(info, current_ev, end_buf, &iwe, buf); if (IS_ERR(current_ev)) goto unlock; memset(&iwe, 0, sizeof(iwe)); iwe.cmd = IWEVCUSTOM; sprintf(buf, " Last beacon: %ums ago", elapsed_jiffies_msecs(bss->ts)); iwe.u.data.length = strlen(buf); current_ev = iwe_stream_add_point_check(info, current_ev, end_buf, &iwe, buf); if (IS_ERR(current_ev)) goto unlock; current_ev = ieee80211_scan_add_ies(info, ies, current_ev, end_buf); unlock: rcu_read_unlock(); return current_ev; } static int ieee80211_scan_results(struct cfg80211_registered_device *rdev, struct iw_request_info *info, char *buf, size_t len) { char *current_ev = buf; char *end_buf = buf + len; struct cfg80211_internal_bss *bss; int err = 0; spin_lock_bh(&rdev->bss_lock); cfg80211_bss_expire(rdev); list_for_each_entry(bss, &rdev->bss_list, list) { if (buf + len - current_ev <= IW_EV_ADDR_LEN) { err = -E2BIG; break; } current_ev = ieee80211_bss(&rdev->wiphy, info, bss, current_ev, end_buf); if (IS_ERR(current_ev)) { err = PTR_ERR(current_ev); break; } } spin_unlock_bh(&rdev->bss_lock); if (err) return err; return current_ev - buf; } int cfg80211_wext_giwscan(struct net_device *dev, struct iw_request_info *info, struct iw_point *data, char *extra) { struct cfg80211_registered_device *rdev; int res; if (!netif_running(dev)) return -ENETDOWN; rdev = cfg80211_get_dev_from_ifindex(dev_net(dev), dev->ifindex); if (IS_ERR(rdev)) return PTR_ERR(rdev); if (rdev->scan_req || rdev->scan_msg) return -EAGAIN; res = ieee80211_scan_results(rdev, info, extra, data->length); data->length = 0; if (res >= 0) { data->length = res; res = 0; } return res; } EXPORT_WEXT_HANDLER(cfg80211_wext_giwscan); #endif
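The handler above walks the cached BSS list under rdev->bss_lock and stops with -E2BIG as soon as the caller's buffer cannot hold another event, so userspace is expected to retry with a larger buffer. Below is a minimal sketch, not taken from this file, of how a wireless-extensions client might read those results through the SIOCGIWSCAN ioctl; the interface name handling, the 4 KiB starting size and the function name are illustrative assumptions.

/*
 * Sketch only: read SIOCGIWSCAN results into a growing buffer.
 * The helper name and starting size are made up for illustration.
 */
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <linux/wireless.h>

static int dump_scan_results(const char *ifname)
{
	int fd = socket(AF_INET, SOCK_DGRAM, 0);
	unsigned int buflen = 4096;	/* iw_point.length is 16-bit, so at most 64 KiB */
	char *buf = NULL;
	int ret = -1;

	if (fd < 0)
		return -1;

	while (buflen <= 65535) {
		struct iwreq wrq;
		char *tmp = realloc(buf, buflen);

		if (!tmp)
			break;
		buf = tmp;

		memset(&wrq, 0, sizeof(wrq));
		strncpy(wrq.ifr_ifrn.ifrn_name, ifname, IFNAMSIZ - 1);
		wrq.u.data.pointer = buf;
		wrq.u.data.length = buflen;

		if (ioctl(fd, SIOCGIWSCAN, &wrq) == 0) {
			/* the kernel stored this many bytes of iw_event records */
			printf("%s: %u bytes of scan events\n", ifname, wrq.u.data.length);
			ret = 0;
			break;
		}
		if (errno != E2BIG)	/* e.g. EAGAIN while a scan is still running */
			break;
		buflen *= 2;		/* buffer was too small: grow and retry */
	}

	free(buf);
	close(fd);
	return ret;
}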

915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 // SPDX-License-Identifier: GPL-2.0 /* * security/tomoyo/condition.c * * Copyright (C) 2005-2011 NTT DATA CORPORATION */ #include "common.h" #include <linux/slab.h> /* List of "struct tomoyo_condition". */ LIST_HEAD(tomoyo_condition_list); /** * tomoyo_argv - Check argv[] in "struct linux_binbrm". * * @index: Index number of @arg_ptr. * @arg_ptr: Contents of argv[@index]. * @argc: Length of @argv. * @argv: Pointer to "struct tomoyo_argv". * @checked: Set to true if @argv[@index] was found. * * Returns true on success, false otherwise. */ static bool tomoyo_argv(const unsigned int index, const char *arg_ptr, const int argc, const struct tomoyo_argv *argv, u8 *checked) { int i; struct tomoyo_path_info arg; arg.name = arg_ptr; for (i = 0; i < argc; argv++, checked++, i++) { bool result; if (index != argv->index) continue; *checked = 1; tomoyo_fill_path_info(&arg); result = tomoyo_path_matches_pattern(&arg, argv->value); if (argv->is_not) result = !result; if (!result) return false; } return true; } /** * tomoyo_envp - Check envp[] in "struct linux_binbrm". * * @env_name: The name of environment variable. * @env_value: The value of environment variable. * @envc: Length of @envp. * @envp: Pointer to "struct tomoyo_envp". * @checked: Set to true if @envp[@env_name] was found. * * Returns true on success, false otherwise. */ static bool tomoyo_envp(const char *env_name, const char *env_value, const int envc, const struct tomoyo_envp *envp, u8 *checked) { int i; struct tomoyo_path_info name; struct tomoyo_path_info value; name.name = env_name; tomoyo_fill_path_info(&name); value.name = env_value; tomoyo_fill_path_info(&value); for (i = 0; i < envc; envp++, checked++, i++) { bool result; if (!tomoyo_path_matches_pattern(&name, envp->name)) continue; *checked = 1; if (envp->value) { result = tomoyo_path_matches_pattern(&value, envp->value); if (envp->is_not) result = !result; } else { result = true; if (!envp->is_not) result = !result; } if (!result) return false; } return true; } /** * tomoyo_scan_bprm - Scan "struct linux_binprm". * * @ee: Pointer to "struct tomoyo_execve". * @argc: Length of @argc. * @argv: Pointer to "struct tomoyo_argv". * @envc: Length of @envp. * @envp: Pointer to "struct tomoyo_envp". * * Returns true on success, false otherwise. 
*/ static bool tomoyo_scan_bprm(struct tomoyo_execve *ee, const u16 argc, const struct tomoyo_argv *argv, const u16 envc, const struct tomoyo_envp *envp) { struct linux_binprm *bprm = ee->bprm; struct tomoyo_page_dump *dump = &ee->dump; char *arg_ptr = ee->tmp; int arg_len = 0; unsigned long pos = bprm->p; int offset = pos % PAGE_SIZE; int argv_count = bprm->argc; int envp_count = bprm->envc; bool result = true; u8 local_checked[32]; u8 *checked; if (argc + envc <= sizeof(local_checked)) { checked = local_checked; memset(local_checked, 0, sizeof(local_checked)); } else { checked = kzalloc(argc + envc, GFP_NOFS); if (!checked) return false; } while (argv_count || envp_count) { if (!tomoyo_dump_page(bprm, pos, dump)) { result = false; goto out; } pos += PAGE_SIZE - offset; while (offset < PAGE_SIZE) { /* Read. */ const char *kaddr = dump->data; const unsigned char c = kaddr[offset++]; if (c && arg_len < TOMOYO_EXEC_TMPSIZE - 10) { if (c == '\\') { arg_ptr[arg_len++] = '\\'; arg_ptr[arg_len++] = '\\'; } else if (c > ' ' && c < 127) { arg_ptr[arg_len++] = c; } else { arg_ptr[arg_len++] = '\\'; arg_ptr[arg_len++] = (c >> 6) + '0'; arg_ptr[arg_len++] = ((c >> 3) & 7) + '0'; arg_ptr[arg_len++] = (c & 7) + '0'; } } else { arg_ptr[arg_len] = '\0'; } if (c) continue; /* Check. */ if (argv_count) { if (!tomoyo_argv(bprm->argc - argv_count, arg_ptr, argc, argv, checked)) { result = false; break; } argv_count--; } else if (envp_count) { char *cp = strchr(arg_ptr, '='); if (cp) { *cp = '\0'; if (!tomoyo_envp(arg_ptr, cp + 1, envc, envp, checked + argc)) { result = false; break; } } envp_count--; } else { break; } arg_len = 0; } offset = 0; if (!result) break; } out: if (result) { int i; /* Check not-yet-checked entries. */ for (i = 0; i < argc; i++) { if (checked[i]) continue; /* * Return true only if all unchecked indexes in * bprm->argv[] are not matched. */ if (argv[i].is_not) continue; result = false; break; } for (i = 0; i < envc; envp++, i++) { if (checked[argc + i]) continue; /* * Return true only if all unchecked environ variables * in bprm->envp[] are either undefined or not matched. */ if ((!envp->value && !envp->is_not) || (envp->value && envp->is_not)) continue; result = false; break; } } if (checked != local_checked) kfree(checked); return result; } /** * tomoyo_scan_exec_realpath - Check "exec.realpath" parameter of "struct tomoyo_condition". * * @file: Pointer to "struct file". * @ptr: Pointer to "struct tomoyo_name_union". * @match: True if "exec.realpath=", false if "exec.realpath!=". * * Returns true on success, false otherwise. */ static bool tomoyo_scan_exec_realpath(struct file *file, const struct tomoyo_name_union *ptr, const bool match) { bool result; struct tomoyo_path_info exe; if (!file) return false; exe.name = tomoyo_realpath_from_path(&file->f_path); if (!exe.name) return false; tomoyo_fill_path_info(&exe); result = tomoyo_compare_name_union(&exe, ptr); kfree(exe.name); return result == match; } /** * tomoyo_get_dqword - tomoyo_get_name() for a quoted string. * * @start: String to save. * * Returns pointer to "struct tomoyo_path_info" on success, NULL otherwise. */ static const struct tomoyo_path_info *tomoyo_get_dqword(char *start) { char *cp = start + strlen(start) - 1; if (cp == start || *start++ != '"' || *cp != '"') return NULL; *cp = '\0'; if (*start && !tomoyo_correct_word(start)) return NULL; return tomoyo_get_name(start); } /** * tomoyo_parse_name_union_quoted - Parse a quoted word. * * @param: Pointer to "struct tomoyo_acl_param". 
* @ptr: Pointer to "struct tomoyo_name_union". * * Returns true on success, false otherwise. */ static bool tomoyo_parse_name_union_quoted(struct tomoyo_acl_param *param, struct tomoyo_name_union *ptr) { char *filename = param->data; if (*filename == '@') return tomoyo_parse_name_union(param, ptr); ptr->filename = tomoyo_get_dqword(filename); return ptr->filename != NULL; } /** * tomoyo_parse_argv - Parse an argv[] condition part. * * @left: Lefthand value. * @right: Righthand value. * @argv: Pointer to "struct tomoyo_argv". * * Returns true on success, false otherwise. */ static bool tomoyo_parse_argv(char *left, char *right, struct tomoyo_argv *argv) { if (tomoyo_parse_ulong(&argv->index, &left) != TOMOYO_VALUE_TYPE_DECIMAL || *left++ != ']' || *left) return false; argv->value = tomoyo_get_dqword(right); return argv->value != NULL; } /** * tomoyo_parse_envp - Parse an envp[] condition part. * * @left: Lefthand value. * @right: Righthand value. * @envp: Pointer to "struct tomoyo_envp". * * Returns true on success, false otherwise. */ static bool tomoyo_parse_envp(char *left, char *right, struct tomoyo_envp *envp) { const struct tomoyo_path_info *name; const struct tomoyo_path_info *value; char *cp = left + strlen(left) - 1; if (*cp-- != ']' || *cp != '"') goto out; *cp = '\0'; if (!tomoyo_correct_word(left)) goto out; name = tomoyo_get_name(left); if (!name) goto out; if (!strcmp(right, "NULL")) { value = NULL; } else { value = tomoyo_get_dqword(right); if (!value) { tomoyo_put_name(name); goto out; } } envp->name = name; envp->value = value; return true; out: return false; } /** * tomoyo_same_condition - Check for duplicated "struct tomoyo_condition" entry. * * @a: Pointer to "struct tomoyo_condition". * @b: Pointer to "struct tomoyo_condition". * * Returns true if @a == @b, false otherwise. */ static inline bool tomoyo_same_condition(const struct tomoyo_condition *a, const struct tomoyo_condition *b) { return a->size == b->size && a->condc == b->condc && a->numbers_count == b->numbers_count && a->names_count == b->names_count && a->argc == b->argc && a->envc == b->envc && a->grant_log == b->grant_log && a->transit == b->transit && !memcmp(a + 1, b + 1, a->size - sizeof(*a)); } /** * tomoyo_condition_type - Get condition type. * * @word: Keyword string. * * Returns one of values in "enum tomoyo_conditions_index" on success, * TOMOYO_MAX_CONDITION_KEYWORD otherwise. */ static u8 tomoyo_condition_type(const char *word) { u8 i; for (i = 0; i < TOMOYO_MAX_CONDITION_KEYWORD; i++) { if (!strcmp(word, tomoyo_condition_keyword[i])) break; } return i; } /* Define this to enable debug mode. */ /* #define DEBUG_CONDITION */ #ifdef DEBUG_CONDITION #define dprintk printk #else #define dprintk(...) do { } while (0) #endif /** * tomoyo_commit_condition - Commit "struct tomoyo_condition". * * @entry: Pointer to "struct tomoyo_condition". * * Returns pointer to "struct tomoyo_condition" on success, NULL otherwise. * * This function merges duplicated entries. This function returns NULL if * @entry is not duplicated but memory quota for policy has exceeded. 
*/ static struct tomoyo_condition *tomoyo_commit_condition (struct tomoyo_condition *entry) { struct tomoyo_condition *ptr; bool found = false; if (mutex_lock_interruptible(&tomoyo_policy_lock)) { dprintk(KERN_WARNING "%u: %s failed\n", __LINE__, __func__); ptr = NULL; found = true; goto out; } list_for_each_entry(ptr, &tomoyo_condition_list, head.list) { if (!tomoyo_same_condition(ptr, entry) || atomic_read(&ptr->head.users) == TOMOYO_GC_IN_PROGRESS) continue; /* Same entry found. Share this entry. */ atomic_inc(&ptr->head.users); found = true; break; } if (!found) { if (tomoyo_memory_ok(entry)) { atomic_set(&entry->head.users, 1); list_add(&entry->head.list, &tomoyo_condition_list); } else { found = true; ptr = NULL; } } mutex_unlock(&tomoyo_policy_lock); out: if (found) { tomoyo_del_condition(&entry->head.list); kfree(entry); entry = ptr; } return entry; } /** * tomoyo_get_transit_preference - Parse domain transition preference for execve(). * * @param: Pointer to "struct tomoyo_acl_param". * @e: Pointer to "struct tomoyo_condition". * * Returns the condition string part. */ static char *tomoyo_get_transit_preference(struct tomoyo_acl_param *param, struct tomoyo_condition *e) { char * const pos = param->data; bool flag; if (*pos == '<') { e->transit = tomoyo_get_domainname(param); goto done; } { char *cp = strchr(pos, ' '); if (cp) *cp = '\0'; flag = tomoyo_correct_path(pos) || !strcmp(pos, "keep") || !strcmp(pos, "initialize") || !strcmp(pos, "reset") || !strcmp(pos, "child") || !strcmp(pos, "parent"); if (cp) *cp = ' '; } if (!flag) return pos; e->transit = tomoyo_get_name(tomoyo_read_token(param)); done: if (e->transit) return param->data; /* * Return a bad read-only condition string that will let * tomoyo_get_condition() return NULL. */ return "/"; } /** * tomoyo_get_condition - Parse condition part. * * @param: Pointer to "struct tomoyo_acl_param". * * Returns pointer to "struct tomoyo_condition" on success, NULL otherwise. */ struct tomoyo_condition *tomoyo_get_condition(struct tomoyo_acl_param *param) { struct tomoyo_condition *entry = NULL; struct tomoyo_condition_element *condp = NULL; struct tomoyo_number_union *numbers_p = NULL; struct tomoyo_name_union *names_p = NULL; struct tomoyo_argv *argv = NULL; struct tomoyo_envp *envp = NULL; struct tomoyo_condition e = { }; char * const start_of_string = tomoyo_get_transit_preference(param, &e); char * const end_of_string = start_of_string + strlen(start_of_string); char *pos; rerun: pos = start_of_string; while (1) { u8 left = -1; u8 right = -1; char *left_word = pos; char *cp; char *right_word; bool is_not; if (!*left_word) break; /* * Since left-hand condition does not allow use of "path_group" * or "number_group" and environment variable's names do not * accept '=', it is guaranteed that the original line consists * of one or more repetition of $left$operator$right blocks * where "$left is free from '=' and ' '" and "$operator is * either '=' or '!='" and "$right is free from ' '". * Therefore, we can reconstruct the original line at the end * of dry run even if we overwrite $operator with '\0'. */ cp = strchr(pos, ' '); if (cp) { *cp = '\0'; /* Will restore later. */ pos = cp + 1; } else { pos = ""; } right_word = strchr(left_word, '='); if (!right_word || right_word == left_word) goto out; is_not = *(right_word - 1) == '!'; if (is_not) *(right_word++ - 1) = '\0'; /* Will restore later. */ else if (*(right_word + 1) != '=') *right_word++ = '\0'; /* Will restore later. 
*/ else goto out; dprintk(KERN_WARNING "%u: <%s>%s=<%s>\n", __LINE__, left_word, is_not ? "!" : "", right_word); if (!strcmp(left_word, "grant_log")) { if (entry) { if (is_not || entry->grant_log != TOMOYO_GRANTLOG_AUTO) goto out; else if (!strcmp(right_word, "yes")) entry->grant_log = TOMOYO_GRANTLOG_YES; else if (!strcmp(right_word, "no")) entry->grant_log = TOMOYO_GRANTLOG_NO; else goto out; } continue; } if (!strncmp(left_word, "exec.argv[", 10)) { if (!argv) { e.argc++; e.condc++; } else { e.argc--; e.condc--; left = TOMOYO_ARGV_ENTRY; argv->is_not = is_not; if (!tomoyo_parse_argv(left_word + 10, right_word, argv++)) goto out; } goto store_value; } if (!strncmp(left_word, "exec.envp[\"", 11)) { if (!envp) { e.envc++; e.condc++; } else { e.envc--; e.condc--; left = TOMOYO_ENVP_ENTRY; envp->is_not = is_not; if (!tomoyo_parse_envp(left_word + 11, right_word, envp++)) goto out; } goto store_value; } left = tomoyo_condition_type(left_word); dprintk(KERN_WARNING "%u: <%s> left=%u\n", __LINE__, left_word, left); if (left == TOMOYO_MAX_CONDITION_KEYWORD) { if (!numbers_p) { e.numbers_count++; } else { e.numbers_count--; left = TOMOYO_NUMBER_UNION; param->data = left_word; if (*left_word == '@' || !tomoyo_parse_number_union(param, numbers_p++)) goto out; } } if (!condp) e.condc++; else e.condc--; if (left == TOMOYO_EXEC_REALPATH || left == TOMOYO_SYMLINK_TARGET) { if (!names_p) { e.names_count++; } else { e.names_count--; right = TOMOYO_NAME_UNION; param->data = right_word; if (!tomoyo_parse_name_union_quoted(param, names_p++)) goto out; } goto store_value; } right = tomoyo_condition_type(right_word); if (right == TOMOYO_MAX_CONDITION_KEYWORD) { if (!numbers_p) { e.numbers_count++; } else { e.numbers_count--; right = TOMOYO_NUMBER_UNION; param->data = right_word; if (!tomoyo_parse_number_union(param, numbers_p++)) goto out; } } store_value: if (!condp) { dprintk(KERN_WARNING "%u: dry_run left=%u right=%u match=%u\n", __LINE__, left, right, !is_not); continue; } condp->left = left; condp->right = right; condp->equals = !is_not; dprintk(KERN_WARNING "%u: left=%u right=%u match=%u\n", __LINE__, condp->left, condp->right, condp->equals); condp++; } dprintk(KERN_INFO "%u: cond=%u numbers=%u names=%u ac=%u ec=%u\n", __LINE__, e.condc, e.numbers_count, e.names_count, e.argc, e.envc); if (entry) { BUG_ON(e.names_count | e.numbers_count | e.argc | e.envc | e.condc); return tomoyo_commit_condition(entry); } e.size = sizeof(*entry) + e.condc * sizeof(struct tomoyo_condition_element) + e.numbers_count * sizeof(struct tomoyo_number_union) + e.names_count * sizeof(struct tomoyo_name_union) + e.argc * sizeof(struct tomoyo_argv) + e.envc * sizeof(struct tomoyo_envp); entry = kzalloc(e.size, GFP_NOFS); if (!entry) goto out2; *entry = e; e.transit = NULL; condp = (struct tomoyo_condition_element *) (entry + 1); numbers_p = (struct tomoyo_number_union *) (condp + e.condc); names_p = (struct tomoyo_name_union *) (numbers_p + e.numbers_count); argv = (struct tomoyo_argv *) (names_p + e.names_count); envp = (struct tomoyo_envp *) (argv + e.argc); { bool flag = false; for (pos = start_of_string; pos < end_of_string; pos++) { if (*pos) continue; if (flag) /* Restore " ". */ *pos = ' '; else if (*(pos + 1) == '=') /* Restore "!=". */ *pos = '!'; else /* Restore "=". 
*/ *pos = '='; flag = !flag; } } goto rerun; out: dprintk(KERN_WARNING "%u: %s failed\n", __LINE__, __func__); if (entry) { tomoyo_del_condition(&entry->head.list); kfree(entry); } out2: tomoyo_put_name(e.transit); return NULL; } /** * tomoyo_get_attributes - Revalidate "struct inode". * * @obj: Pointer to "struct tomoyo_obj_info". * * Returns nothing. */ void tomoyo_get_attributes(struct tomoyo_obj_info *obj) { u8 i; struct dentry *dentry = NULL; for (i = 0; i < TOMOYO_MAX_PATH_STAT; i++) { struct inode *inode; switch (i) { case TOMOYO_PATH1: dentry = obj->path1.dentry; if (!dentry) continue; break; case TOMOYO_PATH2: dentry = obj->path2.dentry; if (!dentry) continue; break; default: if (!dentry) continue; dentry = dget_parent(dentry); break; } inode = d_backing_inode(dentry); if (inode) { struct tomoyo_mini_stat *stat = &obj->stat[i]; stat->uid = inode->i_uid; stat->gid = inode->i_gid; stat->ino = inode->i_ino; stat->mode = inode->i_mode; stat->dev = inode->i_sb->s_dev; stat->rdev = inode->i_rdev; obj->stat_valid[i] = true; } if (i & 1) /* TOMOYO_PATH1_PARENT or TOMOYO_PATH2_PARENT */ dput(dentry); } } /** * tomoyo_condition - Check condition part. * * @r: Pointer to "struct tomoyo_request_info". * @cond: Pointer to "struct tomoyo_condition". Maybe NULL. * * Returns true on success, false otherwise. * * Caller holds tomoyo_read_lock(). */ bool tomoyo_condition(struct tomoyo_request_info *r, const struct tomoyo_condition *cond) { u32 i; unsigned long min_v[2] = { 0, 0 }; unsigned long max_v[2] = { 0, 0 }; const struct tomoyo_condition_element *condp; const struct tomoyo_number_union *numbers_p; const struct tomoyo_name_union *names_p; const struct tomoyo_argv *argv; const struct tomoyo_envp *envp; struct tomoyo_obj_info *obj; u16 condc; u16 argc; u16 envc; struct linux_binprm *bprm = NULL; if (!cond) return true; condc = cond->condc; argc = cond->argc; envc = cond->envc; obj = r->obj; if (r->ee) bprm = r->ee->bprm; if (!bprm && (argc || envc)) return false; condp = (struct tomoyo_condition_element *) (cond + 1); numbers_p = (const struct tomoyo_number_union *) (condp + condc); names_p = (const struct tomoyo_name_union *) (numbers_p + cond->numbers_count); argv = (const struct tomoyo_argv *) (names_p + cond->names_count); envp = (const struct tomoyo_envp *) (argv + argc); for (i = 0; i < condc; i++) { const bool match = condp->equals; const u8 left = condp->left; const u8 right = condp->right; bool is_bitop[2] = { false, false }; u8 j; condp++; /* Check argv[] and envp[] later. */ if (left == TOMOYO_ARGV_ENTRY || left == TOMOYO_ENVP_ENTRY) continue; /* Check string expressions. */ if (right == TOMOYO_NAME_UNION) { const struct tomoyo_name_union *ptr = names_p++; struct tomoyo_path_info *symlink; struct tomoyo_execve *ee; struct file *file; switch (left) { case TOMOYO_SYMLINK_TARGET: symlink = obj ? obj->symlink_target : NULL; if (!symlink || !tomoyo_compare_name_union(symlink, ptr) == match) goto out; break; case TOMOYO_EXEC_REALPATH: ee = r->ee; file = ee ? ee->bprm->file : NULL; if (!tomoyo_scan_exec_realpath(file, ptr, match)) goto out; break; } continue; } /* Check numeric or bit-op expressions. */ for (j = 0; j < 2; j++) { const u8 index = j ? 
right : left; unsigned long value = 0; switch (index) { case TOMOYO_TASK_UID: value = from_kuid(&init_user_ns, current_uid()); break; case TOMOYO_TASK_EUID: value = from_kuid(&init_user_ns, current_euid()); break; case TOMOYO_TASK_SUID: value = from_kuid(&init_user_ns, current_suid()); break; case TOMOYO_TASK_FSUID: value = from_kuid(&init_user_ns, current_fsuid()); break; case TOMOYO_TASK_GID: value = from_kgid(&init_user_ns, current_gid()); break; case TOMOYO_TASK_EGID: value = from_kgid(&init_user_ns, current_egid()); break; case TOMOYO_TASK_SGID: value = from_kgid(&init_user_ns, current_sgid()); break; case TOMOYO_TASK_FSGID: value = from_kgid(&init_user_ns, current_fsgid()); break; case TOMOYO_TASK_PID: value = tomoyo_sys_getpid(); break; case TOMOYO_TASK_PPID: value = tomoyo_sys_getppid(); break; case TOMOYO_TYPE_IS_SOCKET: value = S_IFSOCK; break; case TOMOYO_TYPE_IS_SYMLINK: value = S_IFLNK; break; case TOMOYO_TYPE_IS_FILE: value = S_IFREG; break; case TOMOYO_TYPE_IS_BLOCK_DEV: value = S_IFBLK; break; case TOMOYO_TYPE_IS_DIRECTORY: value = S_IFDIR; break; case TOMOYO_TYPE_IS_CHAR_DEV: value = S_IFCHR; break; case TOMOYO_TYPE_IS_FIFO: value = S_IFIFO; break; case TOMOYO_MODE_SETUID: value = S_ISUID; break; case TOMOYO_MODE_SETGID: value = S_ISGID; break; case TOMOYO_MODE_STICKY: value = S_ISVTX; break; case TOMOYO_MODE_OWNER_READ: value = 0400; break; case TOMOYO_MODE_OWNER_WRITE: value = 0200; break; case TOMOYO_MODE_OWNER_EXECUTE: value = 0100; break; case TOMOYO_MODE_GROUP_READ: value = 0040; break; case TOMOYO_MODE_GROUP_WRITE: value = 0020; break; case TOMOYO_MODE_GROUP_EXECUTE: value = 0010; break; case TOMOYO_MODE_OTHERS_READ: value = 0004; break; case TOMOYO_MODE_OTHERS_WRITE: value = 0002; break; case TOMOYO_MODE_OTHERS_EXECUTE: value = 0001; break; case TOMOYO_EXEC_ARGC: if (!bprm) goto out; value = bprm->argc; break; case TOMOYO_EXEC_ENVC: if (!bprm) goto out; value = bprm->envc; break; case TOMOYO_NUMBER_UNION: /* Fetch values later. 
*/ break; default: if (!obj) goto out; if (!obj->validate_done) { tomoyo_get_attributes(obj); obj->validate_done = true; } { u8 stat_index; struct tomoyo_mini_stat *stat; switch (index) { case TOMOYO_PATH1_UID: case TOMOYO_PATH1_GID: case TOMOYO_PATH1_INO: case TOMOYO_PATH1_MAJOR: case TOMOYO_PATH1_MINOR: case TOMOYO_PATH1_TYPE: case TOMOYO_PATH1_DEV_MAJOR: case TOMOYO_PATH1_DEV_MINOR: case TOMOYO_PATH1_PERM: stat_index = TOMOYO_PATH1; break; case TOMOYO_PATH2_UID: case TOMOYO_PATH2_GID: case TOMOYO_PATH2_INO: case TOMOYO_PATH2_MAJOR: case TOMOYO_PATH2_MINOR: case TOMOYO_PATH2_TYPE: case TOMOYO_PATH2_DEV_MAJOR: case TOMOYO_PATH2_DEV_MINOR: case TOMOYO_PATH2_PERM: stat_index = TOMOYO_PATH2; break; case TOMOYO_PATH1_PARENT_UID: case TOMOYO_PATH1_PARENT_GID: case TOMOYO_PATH1_PARENT_INO: case TOMOYO_PATH1_PARENT_PERM: stat_index = TOMOYO_PATH1_PARENT; break; case TOMOYO_PATH2_PARENT_UID: case TOMOYO_PATH2_PARENT_GID: case TOMOYO_PATH2_PARENT_INO: case TOMOYO_PATH2_PARENT_PERM: stat_index = TOMOYO_PATH2_PARENT; break; default: goto out; } if (!obj->stat_valid[stat_index]) goto out; stat = &obj->stat[stat_index]; switch (index) { case TOMOYO_PATH1_UID: case TOMOYO_PATH2_UID: case TOMOYO_PATH1_PARENT_UID: case TOMOYO_PATH2_PARENT_UID: value = from_kuid(&init_user_ns, stat->uid); break; case TOMOYO_PATH1_GID: case TOMOYO_PATH2_GID: case TOMOYO_PATH1_PARENT_GID: case TOMOYO_PATH2_PARENT_GID: value = from_kgid(&init_user_ns, stat->gid); break; case TOMOYO_PATH1_INO: case TOMOYO_PATH2_INO: case TOMOYO_PATH1_PARENT_INO: case TOMOYO_PATH2_PARENT_INO: value = stat->ino; break; case TOMOYO_PATH1_MAJOR: case TOMOYO_PATH2_MAJOR: value = MAJOR(stat->dev); break; case TOMOYO_PATH1_MINOR: case TOMOYO_PATH2_MINOR: value = MINOR(stat->dev); break; case TOMOYO_PATH1_TYPE: case TOMOYO_PATH2_TYPE: value = stat->mode & S_IFMT; break; case TOMOYO_PATH1_DEV_MAJOR: case TOMOYO_PATH2_DEV_MAJOR: value = MAJOR(stat->rdev); break; case TOMOYO_PATH1_DEV_MINOR: case TOMOYO_PATH2_DEV_MINOR: value = MINOR(stat->rdev); break; case TOMOYO_PATH1_PERM: case TOMOYO_PATH2_PERM: case TOMOYO_PATH1_PARENT_PERM: case TOMOYO_PATH2_PARENT_PERM: value = stat->mode & S_IALLUGO; break; } } break; } max_v[j] = value; min_v[j] = value; switch (index) { case TOMOYO_MODE_SETUID: case TOMOYO_MODE_SETGID: case TOMOYO_MODE_STICKY: case TOMOYO_MODE_OWNER_READ: case TOMOYO_MODE_OWNER_WRITE: case TOMOYO_MODE_OWNER_EXECUTE: case TOMOYO_MODE_GROUP_READ: case TOMOYO_MODE_GROUP_WRITE: case TOMOYO_MODE_GROUP_EXECUTE: case TOMOYO_MODE_OTHERS_READ: case TOMOYO_MODE_OTHERS_WRITE: case TOMOYO_MODE_OTHERS_EXECUTE: is_bitop[j] = true; } } if (left == TOMOYO_NUMBER_UNION) { /* Fetch values now. */ const struct tomoyo_number_union *ptr = numbers_p++; min_v[0] = ptr->values[0]; max_v[0] = ptr->values[1]; } if (right == TOMOYO_NUMBER_UNION) { /* Fetch values now. */ const struct tomoyo_number_union *ptr = numbers_p++; if (ptr->group) { if (tomoyo_number_matches_group(min_v[0], max_v[0], ptr->group) == match) continue; } else { if ((min_v[0] <= ptr->values[1] && max_v[0] >= ptr->values[0]) == match) continue; } goto out; } /* * Bit operation is valid only when counterpart value * represents permission. 
*/ if (is_bitop[0] && is_bitop[1]) { goto out; } else if (is_bitop[0]) { switch (right) { case TOMOYO_PATH1_PERM: case TOMOYO_PATH1_PARENT_PERM: case TOMOYO_PATH2_PERM: case TOMOYO_PATH2_PARENT_PERM: if (!(max_v[0] & max_v[1]) == !match) continue; } goto out; } else if (is_bitop[1]) { switch (left) { case TOMOYO_PATH1_PERM: case TOMOYO_PATH1_PARENT_PERM: case TOMOYO_PATH2_PERM: case TOMOYO_PATH2_PARENT_PERM: if (!(max_v[0] & max_v[1]) == !match) continue; } goto out; } /* Normal value range comparison. */ if ((min_v[0] <= max_v[1] && max_v[0] >= min_v[1]) == match) continue; out: return false; } /* Check argv[] and envp[] now. */ if (r->ee && (argc || envc)) return tomoyo_scan_bprm(r->ee, argc, argv, envc, envp); return true; }
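tomoyo_get_condition() runs over the same string twice: a dry run that only counts elements (e.condc, e.numbers_count, e.names_count, e.argc, e.envc), then, after the kzalloc() sized from those counts, a second pass that fills the arrays; the '=' and '!=' separators it overwrote with '\0' are restored between the passes. For orientation, here is a hedged example of the kind of condition part this parser accepts when appended to a domain-policy ACL line. The keyword spellings (task.uid, path1.uid, and the ACL prefix itself) come from TOMOYO's condition keyword table and policy documentation rather than from this file, so treat the line as illustrative only:

file execute /bin/cat exec.realpath="/bin/cat" exec.argv[0]="cat" exec.envp["LD_PRELOAD"]=NULL task.uid=path1.uid grant_log=yes

Each space-separated block is one left=right (or left!=right) comparison: exec.realpath and exec.argv[]/exec.envp[] take quoted strings or NULL, while task.uid=path1.uid compares two numeric keywords at run time in tomoyo_condition().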
47 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 // SPDX-License-Identifier: GPL-2.0 #include <net/genetlink.h> #include <net/ila.h> #include <net/netns/generic.h> #include <uapi/linux/genetlink.h> #include "ila.h" static const struct nla_policy ila_nl_policy[ILA_ATTR_MAX + 1] = { [ILA_ATTR_LOCATOR] = { .type = NLA_U64, }, [ILA_ATTR_LOCATOR_MATCH] = { .type = NLA_U64, }, [ILA_ATTR_IFINDEX] = { .type = NLA_U32, }, [ILA_ATTR_CSUM_MODE] = { .type = NLA_U8, }, [ILA_ATTR_IDENT_TYPE] = { .type = NLA_U8, }, }; static const struct genl_ops ila_nl_ops[] = { { .cmd = ILA_CMD_ADD, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = ila_xlat_nl_cmd_add_mapping, .flags = GENL_ADMIN_PERM, }, { .cmd = ILA_CMD_DEL, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = ila_xlat_nl_cmd_del_mapping, .flags = GENL_ADMIN_PERM, }, { .cmd = ILA_CMD_FLUSH, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = ila_xlat_nl_cmd_flush, .flags = GENL_ADMIN_PERM, }, { .cmd = ILA_CMD_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = ila_xlat_nl_cmd_get_mapping, .start = ila_xlat_nl_dump_start, .dumpit = ila_xlat_nl_dump, .done = ila_xlat_nl_dump_done, }, }; unsigned int ila_net_id; struct genl_family ila_nl_family __ro_after_init = { .hdrsize = 0, .name = ILA_GENL_NAME, .version = ILA_GENL_VERSION, .maxattr = ILA_ATTR_MAX, .policy = ila_nl_policy, .netnsok = true, .parallel_ops = true, .module = THIS_MODULE, .ops = ila_nl_ops, .n_ops = ARRAY_SIZE(ila_nl_ops), }; static __net_init int ila_init_net(struct net *net) { int err; err = ila_xlat_init_net(net); if (err) goto ila_xlat_init_fail; return 0; ila_xlat_init_fail: return err; } static __net_exit void ila_pre_exit_net(struct net *net) { ila_xlat_pre_exit_net(net); } static __net_exit void ila_exit_net(struct net *net) { ila_xlat_exit_net(net); } static struct pernet_operations ila_net_ops = { .init = ila_init_net, .pre_exit = ila_pre_exit_net, .exit = ila_exit_net, .id = &ila_net_id, .size = sizeof(struct ila_net), }; static int __init ila_init(void) { int ret; ret = register_pernet_device(&ila_net_ops); if (ret) goto register_device_fail; ret = genl_register_family(&ila_nl_family); if (ret) goto register_family_fail; ret = ila_lwt_init(); if (ret) goto fail_lwt; return 0; fail_lwt: genl_unregister_family(&ila_nl_family); register_family_fail: unregister_pernet_device(&ila_net_ops); register_device_fail: return ret; } static void __exit ila_fini(void) { ila_lwt_fini(); genl_unregister_family(&ila_nl_family); unregister_pernet_device(&ila_net_ops); } module_init(ila_init); module_exit(ila_fini); MODULE_AUTHOR("Tom Herbert <tom@herbertland.com>"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("IPv6: Identifier Locator Addressing (ILA)");
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_BIT_SPINLOCK_H
#define __LINUX_BIT_SPINLOCK_H

#include <linux/kernel.h>
#include <linux/preempt.h>
#include <linux/atomic.h>
#include <linux/bug.h>

/*
 * bit-based spin_lock()
 *
 * Don't use this unless you really need to: spin_lock() and spin_unlock()
 * are significantly faster.
 */
static inline void bit_spin_lock(int bitnum, unsigned long *addr)
{
	/*
	 * Assuming the lock is uncontended, this never enters
	 * the body of the outer loop. If it is contended, then
	 * within the inner loop a non-atomic test is used to
	 * busywait with less bus contention for a good time to
	 * attempt to acquire the lock bit.
	 */
	preempt_disable();
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
	while (unlikely(test_and_set_bit_lock(bitnum, addr))) {
		preempt_enable();
		do {
			cpu_relax();
		} while (test_bit(bitnum, addr));
		preempt_disable();
	}
#endif
	__acquire(bitlock);
}

/*
 * Return true if it was acquired
 */
static inline int bit_spin_trylock(int bitnum, unsigned long *addr)
{
	preempt_disable();
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
	if (unlikely(test_and_set_bit_lock(bitnum, addr))) {
		preempt_enable();
		return 0;
	}
#endif
	__acquire(bitlock);
	return 1;
}

/*
 * bit-based spin_unlock()
 */
static inline void bit_spin_unlock(int bitnum, unsigned long *addr)
{
#ifdef CONFIG_DEBUG_SPINLOCK
	BUG_ON(!test_bit(bitnum, addr));
#endif
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
	clear_bit_unlock(bitnum, addr);
#endif
	preempt_enable();
	__release(bitlock);
}

/*
 * bit-based spin_unlock()
 * non-atomic version, which can be used eg. if the bit lock itself is
 * protecting the rest of the flags in the word.
 */
static inline void __bit_spin_unlock(int bitnum, unsigned long *addr)
{
#ifdef CONFIG_DEBUG_SPINLOCK
	BUG_ON(!test_bit(bitnum, addr));
#endif
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
	__clear_bit_unlock(bitnum, addr);
#endif
	preempt_enable();
	__release(bitlock);
}

/*
 * Return true if the lock is held.
 */
static inline int bit_spin_is_locked(int bitnum, unsigned long *addr)
{
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
	return test_bit(bitnum, addr);
#elif defined CONFIG_PREEMPT_COUNT
	return preempt_count();
#else
	return 1;
#endif
}

#endif /* __LINUX_BIT_SPINLOCK_H */
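bit_spin_lock() turns a single bit of an arbitrary unsigned long into a spinlock: the bit is taken with test_and_set_bit_lock() under disabled preemption, and contended waiters spin on a plain test_bit() read to keep bus traffic down. A minimal usage sketch follows; the structure, the bit number and the function names are invented for illustration and are not part of this header.

/* Sketch only: one word holds both the lock bit and other state bits. */
#include <linux/types.h>
#include <linux/bit_spinlock.h>

#define MY_OBJ_LOCK_BIT	0		/* bit 0 of ->flags acts as the lock */

struct my_obj {
	unsigned long flags;		/* lock bit shares the word with other flags */
	int counter;
};

static void my_obj_inc(struct my_obj *obj)
{
	bit_spin_lock(MY_OBJ_LOCK_BIT, &obj->flags);	/* also disables preemption */
	obj->counter++;					/* critical section */
	bit_spin_unlock(MY_OBJ_LOCK_BIT, &obj->flags);
}

static bool my_obj_try_inc(struct my_obj *obj)
{
	if (!bit_spin_trylock(MY_OBJ_LOCK_BIT, &obj->flags))
		return false;
	obj->counter++;
	bit_spin_unlock(MY_OBJ_LOCK_BIT, &obj->flags);
	return true;
}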
79 3 71 79 3 79 79 79 3 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/char_dev.c * * Copyright (C) 1991, 1992 Linus Torvalds */ #include <linux/init.h> #include <linux/fs.h> #include <linux/kdev_t.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/major.h> #include <linux/errno.h> #include <linux/module.h> #include <linux/seq_file.h> #include <linux/kobject.h> #include <linux/kobj_map.h> #include <linux/cdev.h> #include <linux/mutex.h> #include <linux/backing-dev.h> #include <linux/tty.h> #include "internal.h" static struct kobj_map *cdev_map; static DEFINE_MUTEX(chrdevs_lock); #define CHRDEV_MAJOR_HASH_SIZE 255 static struct char_device_struct { struct char_device_struct *next; unsigned int major; unsigned int baseminor; int minorct; char name[64]; struct cdev *cdev; /* will die */ } *chrdevs[CHRDEV_MAJOR_HASH_SIZE]; /* index in the above */ static inline int major_to_index(unsigned 
major) { return major % CHRDEV_MAJOR_HASH_SIZE; } #ifdef CONFIG_PROC_FS void chrdev_show(struct seq_file *f, off_t offset) { struct char_device_struct *cd; mutex_lock(&chrdevs_lock); for (cd = chrdevs[major_to_index(offset)]; cd; cd = cd->next) { if (cd->major == offset) seq_printf(f, "%3d %s\n", cd->major, cd->name); } mutex_unlock(&chrdevs_lock); } #endif /* CONFIG_PROC_FS */ static int find_dynamic_major(void) { int i; struct char_device_struct *cd; for (i = ARRAY_SIZE(chrdevs)-1; i >= CHRDEV_MAJOR_DYN_END; i--) { if (chrdevs[i] == NULL) return i; } for (i = CHRDEV_MAJOR_DYN_EXT_START; i >= CHRDEV_MAJOR_DYN_EXT_END; i--) { for (cd = chrdevs[major_to_index(i)]; cd; cd = cd->next) if (cd->major == i) break; if (cd == NULL) return i; } return -EBUSY; } /* * Register a single major with a specified minor range. * * If major == 0 this function will dynamically allocate an unused major. * If major > 0 this function will attempt to reserve the range of minors * with given major. * */ static struct char_device_struct * __register_chrdev_region(unsigned int major, unsigned int baseminor, int minorct, const char *name) { struct char_device_struct *cd, *curr, *prev = NULL; int ret; int i; if (major >= CHRDEV_MAJOR_MAX) { pr_err("CHRDEV \"%s\" major requested (%u) is greater than the maximum (%u)\n", name, major, CHRDEV_MAJOR_MAX-1); return ERR_PTR(-EINVAL); } if (minorct > MINORMASK + 1 - baseminor) { pr_err("CHRDEV \"%s\" minor range requested (%u-%u) is out of range of maximum range (%u-%u) for a single major\n", name, baseminor, baseminor + minorct - 1, 0, MINORMASK); return ERR_PTR(-EINVAL); } cd = kzalloc(sizeof(struct char_device_struct), GFP_KERNEL); if (cd == NULL) return ERR_PTR(-ENOMEM); mutex_lock(&chrdevs_lock); if (major == 0) { ret = find_dynamic_major(); if (ret < 0) { pr_err("CHRDEV \"%s\" dynamic allocation region is full\n", name); goto out; } major = ret; } ret = -EBUSY; i = major_to_index(major); for (curr = chrdevs[i]; curr; prev = curr, curr = curr->next) { if (curr->major < major) continue; if (curr->major > major) break; if (curr->baseminor + curr->minorct <= baseminor) continue; if (curr->baseminor >= baseminor + minorct) break; goto out; } cd->major = major; cd->baseminor = baseminor; cd->minorct = minorct; strlcpy(cd->name, name, sizeof(cd->name)); if (!prev) { cd->next = curr; chrdevs[i] = cd; } else { cd->next = prev->next; prev->next = cd; } mutex_unlock(&chrdevs_lock); return cd; out: mutex_unlock(&chrdevs_lock); kfree(cd); return ERR_PTR(ret); } static struct char_device_struct * __unregister_chrdev_region(unsigned major, unsigned baseminor, int minorct) { struct char_device_struct *cd = NULL, **cp; int i = major_to_index(major); mutex_lock(&chrdevs_lock); for (cp = &chrdevs[i]; *cp; cp = &(*cp)->next) if ((*cp)->major == major && (*cp)->baseminor == baseminor && (*cp)->minorct == minorct) break; if (*cp) { cd = *cp; *cp = cd->next; } mutex_unlock(&chrdevs_lock); return cd; } /** * register_chrdev_region() - register a range of device numbers * @from: the first in the desired range of device numbers; must include * the major number. * @count: the number of consecutive device numbers required * @name: the name of the device or driver. * * Return value is zero on success, a negative error code on failure. 
*/ int register_chrdev_region(dev_t from, unsigned count, const char *name) { struct char_device_struct *cd; dev_t to = from + count; dev_t n, next; for (n = from; n < to; n = next) { next = MKDEV(MAJOR(n)+1, 0); if (next > to) next = to; cd = __register_chrdev_region(MAJOR(n), MINOR(n), next - n, name); if (IS_ERR(cd)) goto fail; } return 0; fail: to = n; for (n = from; n < to; n = next) { next = MKDEV(MAJOR(n)+1, 0); kfree(__unregister_chrdev_region(MAJOR(n), MINOR(n), next - n)); } return PTR_ERR(cd); } /** * alloc_chrdev_region() - register a range of char device numbers * @dev: output parameter for first assigned number * @baseminor: first of the requested range of minor numbers * @count: the number of minor numbers required * @name: the name of the associated device or driver * * Allocates a range of char device numbers. The major number will be * chosen dynamically, and returned (along with the first minor number) * in @dev. Returns zero or a negative error code. */ int alloc_chrdev_region(dev_t *dev, unsigned baseminor, unsigned count, const char *name) { struct char_device_struct *cd; cd = __register_chrdev_region(0, baseminor, count, name); if (IS_ERR(cd)) return PTR_ERR(cd); *dev = MKDEV(cd->major, cd->baseminor); return 0; } /** * __register_chrdev() - create and register a cdev occupying a range of minors * @major: major device number or 0 for dynamic allocation * @baseminor: first of the requested range of minor numbers * @count: the number of minor numbers required * @name: name of this range of devices * @fops: file operations associated with this devices * * If @major == 0 this functions will dynamically allocate a major and return * its number. * * If @major > 0 this function will attempt to reserve a device with the given * major number and will return zero on success. * * Returns a -ve errno on failure. * * The name of this device has nothing to do with the name of the device in * /dev. It only helps to keep track of the different owners of devices. If * your module name has only one type of devices it's ok to use e.g. the name * of the module here. */ int __register_chrdev(unsigned int major, unsigned int baseminor, unsigned int count, const char *name, const struct file_operations *fops) { struct char_device_struct *cd; struct cdev *cdev; int err = -ENOMEM; cd = __register_chrdev_region(major, baseminor, count, name); if (IS_ERR(cd)) return PTR_ERR(cd); cdev = cdev_alloc(); if (!cdev) goto out2; cdev->owner = fops->owner; cdev->ops = fops; kobject_set_name(&cdev->kobj, "%s", name); err = cdev_add(cdev, MKDEV(cd->major, baseminor), count); if (err) goto out; cd->cdev = cdev; return major ? 0 : cd->major; out: kobject_put(&cdev->kobj); out2: kfree(__unregister_chrdev_region(cd->major, baseminor, count)); return err; } /** * unregister_chrdev_region() - unregister a range of device numbers * @from: the first in the range of numbers to unregister * @count: the number of device numbers to unregister * * This function will unregister a range of @count device numbers, * starting with @from. The caller should normally be the one who * allocated those numbers in the first place... 
*/ void unregister_chrdev_region(dev_t from, unsigned count) { dev_t to = from + count; dev_t n, next; for (n = from; n < to; n = next) { next = MKDEV(MAJOR(n)+1, 0); if (next > to) next = to; kfree(__unregister_chrdev_region(MAJOR(n), MINOR(n), next - n)); } } /** * __unregister_chrdev - unregister and destroy a cdev * @major: major device number * @baseminor: first of the range of minor numbers * @count: the number of minor numbers this cdev is occupying * @name: name of this range of devices * * Unregister and destroy the cdev occupying the region described by * @major, @baseminor and @count. This function undoes what * __register_chrdev() did. */ void __unregister_chrdev(unsigned int major, unsigned int baseminor, unsigned int count, const char *name) { struct char_device_struct *cd; cd = __unregister_chrdev_region(major, baseminor, count); if (cd && cd->cdev) cdev_del(cd->cdev); kfree(cd); } static DEFINE_SPINLOCK(cdev_lock); static struct kobject *cdev_get(struct cdev *p) { struct module *owner = p->owner; struct kobject *kobj; if (owner && !try_module_get(owner)) return NULL; kobj = kobject_get_unless_zero(&p->kobj); if (!kobj) module_put(owner); return kobj; } void cdev_put(struct cdev *p) { if (p) { struct module *owner = p->owner; kobject_put(&p->kobj); module_put(owner); } } /* * Called every time a character special file is opened */ static int chrdev_open(struct inode *inode, struct file *filp) { const struct file_operations *fops; struct cdev *p; struct cdev *new = NULL; int ret = 0; spin_lock(&cdev_lock); p = inode->i_cdev; if (!p) { struct kobject *kobj; int idx; spin_unlock(&cdev_lock); kobj = kobj_lookup(cdev_map, inode->i_rdev, &idx); if (!kobj) return -ENXIO; new = container_of(kobj, struct cdev, kobj); spin_lock(&cdev_lock); /* Check i_cdev again in case somebody beat us to it while we dropped the lock. */ p = inode->i_cdev; if (!p) { inode->i_cdev = p = new; list_add(&inode->i_devices, &p->list); new = NULL; } else if (!cdev_get(p)) ret = -ENXIO; } else if (!cdev_get(p)) ret = -ENXIO; spin_unlock(&cdev_lock); cdev_put(new); if (ret) return ret; ret = -ENXIO; fops = fops_get(p->ops); if (!fops) goto out_cdev_put; replace_fops(filp, fops); if (filp->f_op->open) { ret = filp->f_op->open(inode, filp); if (ret) goto out_cdev_put; } return 0; out_cdev_put: cdev_put(p); return ret; } void cd_forget(struct inode *inode) { spin_lock(&cdev_lock); list_del_init(&inode->i_devices); inode->i_cdev = NULL; inode->i_mapping = &inode->i_data; spin_unlock(&cdev_lock); } static void cdev_purge(struct cdev *cdev) { spin_lock(&cdev_lock); while (!list_empty(&cdev->list)) { struct inode *inode; inode = container_of(cdev->list.next, struct inode, i_devices); list_del_init(&inode->i_devices); inode->i_cdev = NULL; } spin_unlock(&cdev_lock); } /* * Dummy default file-operations: the only thing this does * is contain the open that then fills in the correct operations * depending on the special file... */ const struct file_operations def_chr_fops = { .open = chrdev_open, .llseek = noop_llseek, }; static struct kobject *exact_match(dev_t dev, int *part, void *data) { struct cdev *p = data; return &p->kobj; } static int exact_lock(dev_t dev, void *data) { struct cdev *p = data; return cdev_get(p) ? 
0 : -1; } /** * cdev_add() - add a char device to the system * @p: the cdev structure for the device * @dev: the first device number for which this device is responsible * @count: the number of consecutive minor numbers corresponding to this * device * * cdev_add() adds the device represented by @p to the system, making it * live immediately. A negative error code is returned on failure. */ int cdev_add(struct cdev *p, dev_t dev, unsigned count) { int error; p->dev = dev; p->count = count; if (WARN_ON(dev == WHITEOUT_DEV)) return -EBUSY; error = kobj_map(cdev_map, dev, count, NULL, exact_match, exact_lock, p); if (error) return error; kobject_get(p->kobj.parent); return 0; } /** * cdev_set_parent() - set the parent kobject for a char device * @p: the cdev structure * @kobj: the kobject to take a reference to * * cdev_set_parent() sets a parent kobject which will be referenced * appropriately so the parent is not freed before the cdev. This * should be called before cdev_add. */ void cdev_set_parent(struct cdev *p, struct kobject *kobj) { WARN_ON(!kobj->state_initialized); p->kobj.parent = kobj; } /** * cdev_device_add() - add a char device and it's corresponding * struct device, linkink * @dev: the device structure * @cdev: the cdev structure * * cdev_device_add() adds the char device represented by @cdev to the system, * just as cdev_add does. It then adds @dev to the system using device_add * The dev_t for the char device will be taken from the struct device which * needs to be initialized first. This helper function correctly takes a * reference to the parent device so the parent will not get released until * all references to the cdev are released. * * This helper uses dev->devt for the device number. If it is not set * it will not add the cdev and it will be equivalent to device_add. * * This function should be used whenever the struct cdev and the * struct device are members of the same structure whose lifetime is * managed by the struct device. * * NOTE: Callers must assume that userspace was able to open the cdev and * can call cdev fops callbacks at any time, even if this function fails. */ int cdev_device_add(struct cdev *cdev, struct device *dev) { int rc = 0; if (dev->devt) { cdev_set_parent(cdev, &dev->kobj); rc = cdev_add(cdev, dev->devt, 1); if (rc) return rc; } rc = device_add(dev); if (rc && dev->devt) cdev_del(cdev); return rc; } /** * cdev_device_del() - inverse of cdev_device_add * @dev: the device structure * @cdev: the cdev structure * * cdev_device_del() is a helper function to call cdev_del and device_del. * It should be used whenever cdev_device_add is used. * * If dev->devt is not set it will not remove the cdev and will be equivalent * to device_del. * * NOTE: This guarantees that associated sysfs callbacks are not running * or runnable, however any cdevs already open will remain and their fops * will still be callable even after this function returns. */ void cdev_device_del(struct cdev *cdev, struct device *dev) { device_del(dev); if (dev->devt) cdev_del(cdev); } static void cdev_unmap(dev_t dev, unsigned count) { kobj_unmap(cdev_map, dev, count); } /** * cdev_del() - remove a cdev from the system * @p: the cdev structure to be removed * * cdev_del() removes @p from the system, possibly freeing the structure * itself. * * NOTE: This guarantees that cdev device will no longer be able to be * opened, however any cdevs already open will remain and their fops will * still be callable even after cdev_del returns. 
*/ void cdev_del(struct cdev *p) { cdev_unmap(p->dev, p->count); kobject_put(&p->kobj); } static void cdev_default_release(struct kobject *kobj) { struct cdev *p = container_of(kobj, struct cdev, kobj); struct kobject *parent = kobj->parent; cdev_purge(p); kobject_put(parent); } static void cdev_dynamic_release(struct kobject *kobj) { struct cdev *p = container_of(kobj, struct cdev, kobj); struct kobject *parent = kobj->parent; cdev_purge(p); kfree(p); kobject_put(parent); } static struct kobj_type ktype_cdev_default = { .release = cdev_default_release, }; static struct kobj_type ktype_cdev_dynamic = { .release = cdev_dynamic_release, }; /** * cdev_alloc() - allocate a cdev structure * * Allocates and returns a cdev structure, or NULL on failure. */ struct cdev *cdev_alloc(void) { struct cdev *p = kzalloc(sizeof(struct cdev), GFP_KERNEL); if (p) { INIT_LIST_HEAD(&p->list); kobject_init(&p->kobj, &ktype_cdev_dynamic); } return p; } /** * cdev_init() - initialize a cdev structure * @cdev: the structure to initialize * @fops: the file_operations for this device * * Initializes @cdev, remembering @fops, making it ready to add to the * system with cdev_add(). */ void cdev_init(struct cdev *cdev, const struct file_operations *fops) { memset(cdev, 0, sizeof *cdev); INIT_LIST_HEAD(&cdev->list); kobject_init(&cdev->kobj, &ktype_cdev_default); cdev->ops = fops; } static struct kobject *base_probe(dev_t dev, int *part, void *data) { if (request_module("char-major-%d-%d", MAJOR(dev), MINOR(dev)) > 0) /* Make old-style 2.4 aliases work */ request_module("char-major-%d", MAJOR(dev)); return NULL; } void __init chrdev_init(void) { cdev_map = kobj_map_init(base_probe, &chrdevs_lock); } /* Let modules do char dev stuff */ EXPORT_SYMBOL(register_chrdev_region); EXPORT_SYMBOL(unregister_chrdev_region); EXPORT_SYMBOL(alloc_chrdev_region); EXPORT_SYMBOL(cdev_init); EXPORT_SYMBOL(cdev_alloc); EXPORT_SYMBOL(cdev_del); EXPORT_SYMBOL(cdev_add); EXPORT_SYMBOL(cdev_set_parent); EXPORT_SYMBOL(cdev_device_add); EXPORT_SYMBOL(cdev_device_del); EXPORT_SYMBOL(__register_chrdev); EXPORT_SYMBOL(__unregister_chrdev);
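Taken together, the helpers exported above form the usual registration path for a character device: reserve a dev_t range, initialize a struct cdev with the driver's file_operations, and cdev_add() it so that opens resolve through chrdev_open(). A minimal sketch of a module using that path follows; the "demo" names and the near-empty file_operations are placeholders, not anything defined in fs/char_dev.c.

/* Sketch only: register one dynamically numbered character device. */
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/cdev.h>

static dev_t demo_devt;
static struct cdev demo_cdev;

static const struct file_operations demo_fops = {
	.owner  = THIS_MODULE,
	.llseek = noop_llseek,
};

static int __init demo_init(void)
{
	int err;

	/* Ask for one dynamically chosen major, minor 0, named "demo". */
	err = alloc_chrdev_region(&demo_devt, 0, 1, "demo");
	if (err)
		return err;

	cdev_init(&demo_cdev, &demo_fops);
	demo_cdev.owner = THIS_MODULE;

	/* The device becomes live as soon as cdev_add() succeeds. */
	err = cdev_add(&demo_cdev, demo_devt, 1);
	if (err)
		unregister_chrdev_region(demo_devt, 1);
	return err;
}

static void __exit demo_exit(void)
{
	cdev_del(&demo_cdev);
	unregister_chrdev_region(demo_devt, 1);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");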
709 707 709 709 709 709 323 322 321 323 709 709 708 706 708 709 490 492 706 706 708 706 489 685 708 708 707 707 709 706 709 708 709 706 706 681 707 707 708 709 707 708 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 // SPDX-License-Identifier: GPL-2.0-or-later /* * x86 instruction analysis * * Copyright (C) IBM Corporation, 2002, 2004, 2009 */ #include <linux/kernel.h> #ifdef __KERNEL__ #include <linux/string.h> #else #include <string.h> #endif #include <asm/inat.h> /*__ignore_sync_check__ */ #include <asm/insn.h> /* __ignore_sync_check__ */ #include <asm/unaligned.h> /* __ignore_sync_check__ */ #include <linux/errno.h> #include <linux/kconfig.h> #include <asm/emulate_prefix.h> /* 
__ignore_sync_check__ */ #define leXX_to_cpu(t, r) \ ({ \ __typeof__(t) v; \ switch (sizeof(t)) { \ case 4: v = le32_to_cpu(r); break; \ case 2: v = le16_to_cpu(r); break; \ case 1: v = r; break; \ default: \ BUILD_BUG(); break; \ } \ v; \ }) /* Verify next sizeof(t) bytes can be on the same instruction */ #define validate_next(t, insn, n) \ ((insn)->next_byte + sizeof(t) + n <= (insn)->end_kaddr) #define __get_next(t, insn) \ ({ t r = get_unaligned((t *)(insn)->next_byte); (insn)->next_byte += sizeof(t); leXX_to_cpu(t, r); }) #define __peek_nbyte_next(t, insn, n) \ ({ t r = get_unaligned((t *)(insn)->next_byte + n); leXX_to_cpu(t, r); }) #define get_next(t, insn) \ ({ if (unlikely(!validate_next(t, insn, 0))) goto err_out; __get_next(t, insn); }) #define peek_nbyte_next(t, insn, n) \ ({ if (unlikely(!validate_next(t, insn, n))) goto err_out; __peek_nbyte_next(t, insn, n); }) #define peek_next(t, insn) peek_nbyte_next(t, insn, 0) /** * insn_init() - initialize struct insn * @insn: &struct insn to be initialized * @kaddr: address (in kernel memory) of instruction (or copy thereof) * @buf_len: length of the insn buffer at @kaddr * @x86_64: !0 for 64-bit kernel or 64-bit app */ void insn_init(struct insn *insn, const void *kaddr, int buf_len, int x86_64) { /* * Instructions longer than MAX_INSN_SIZE (15 bytes) are invalid * even if the input buffer is long enough to hold them. */ if (buf_len > MAX_INSN_SIZE) buf_len = MAX_INSN_SIZE; memset(insn, 0, sizeof(*insn)); insn->kaddr = kaddr; insn->end_kaddr = kaddr + buf_len; insn->next_byte = kaddr; insn->x86_64 = x86_64 ? 1 : 0; insn->opnd_bytes = 4; if (x86_64) insn->addr_bytes = 8; else insn->addr_bytes = 4; } static const insn_byte_t xen_prefix[] = { __XEN_EMULATE_PREFIX }; static const insn_byte_t kvm_prefix[] = { __KVM_EMULATE_PREFIX }; static int __insn_get_emulate_prefix(struct insn *insn, const insn_byte_t *prefix, size_t len) { size_t i; for (i = 0; i < len; i++) { if (peek_nbyte_next(insn_byte_t, insn, i) != prefix[i]) goto err_out; } insn->emulate_prefix_size = len; insn->next_byte += len; return 1; err_out: return 0; } static void insn_get_emulate_prefix(struct insn *insn) { if (__insn_get_emulate_prefix(insn, xen_prefix, sizeof(xen_prefix))) return; __insn_get_emulate_prefix(insn, kvm_prefix, sizeof(kvm_prefix)); } /** * insn_get_prefixes - scan x86 instruction prefix bytes * @insn: &struct insn containing instruction * * Populates the @insn->prefixes bitmap, and updates @insn->next_byte * to point to the (first) opcode. No effect if @insn->prefixes.got * is already set. 
* * * Returns: * 0: on success * < 0: on error */ int insn_get_prefixes(struct insn *insn) { struct insn_field *prefixes = &insn->prefixes; insn_attr_t attr; insn_byte_t b, lb; int i, nb; if (prefixes->got) return 0; insn_get_emulate_prefix(insn); nb = 0; lb = 0; b = peek_next(insn_byte_t, insn); attr = inat_get_opcode_attribute(b); while (inat_is_legacy_prefix(attr)) { /* Skip if same prefix */ for (i = 0; i < nb; i++) if (prefixes->bytes[i] == b) goto found; if (nb == 4) /* Invalid instruction */ break; prefixes->bytes[nb++] = b; if (inat_is_address_size_prefix(attr)) { /* address size switches 2/4 or 4/8 */ if (insn->x86_64) insn->addr_bytes ^= 12; else insn->addr_bytes ^= 6; } else if (inat_is_operand_size_prefix(attr)) { /* oprand size switches 2/4 */ insn->opnd_bytes ^= 6; } found: prefixes->nbytes++; insn->next_byte++; lb = b; b = peek_next(insn_byte_t, insn); attr = inat_get_opcode_attribute(b); } /* Set the last prefix */ if (lb && lb != insn->prefixes.bytes[3]) { if (unlikely(insn->prefixes.bytes[3])) { /* Swap the last prefix */ b = insn->prefixes.bytes[3]; for (i = 0; i < nb; i++) if (prefixes->bytes[i] == lb) insn_set_byte(prefixes, i, b); } insn_set_byte(&insn->prefixes, 3, lb); } /* Decode REX prefix */ if (insn->x86_64) { b = peek_next(insn_byte_t, insn); attr = inat_get_opcode_attribute(b); if (inat_is_rex_prefix(attr)) { insn_field_set(&insn->rex_prefix, b, 1); insn->next_byte++; if (X86_REX_W(b)) /* REX.W overrides opnd_size */ insn->opnd_bytes = 8; } } insn->rex_prefix.got = 1; /* Decode VEX prefix */ b = peek_next(insn_byte_t, insn); attr = inat_get_opcode_attribute(b); if (inat_is_vex_prefix(attr)) { insn_byte_t b2 = peek_nbyte_next(insn_byte_t, insn, 1); if (!insn->x86_64) { /* * In 32-bits mode, if the [7:6] bits (mod bits of * ModRM) on the second byte are not 11b, it is * LDS or LES or BOUND. */ if (X86_MODRM_MOD(b2) != 3) goto vex_end; } insn_set_byte(&insn->vex_prefix, 0, b); insn_set_byte(&insn->vex_prefix, 1, b2); if (inat_is_evex_prefix(attr)) { b2 = peek_nbyte_next(insn_byte_t, insn, 2); insn_set_byte(&insn->vex_prefix, 2, b2); b2 = peek_nbyte_next(insn_byte_t, insn, 3); insn_set_byte(&insn->vex_prefix, 3, b2); insn->vex_prefix.nbytes = 4; insn->next_byte += 4; if (insn->x86_64 && X86_VEX_W(b2)) /* VEX.W overrides opnd_size */ insn->opnd_bytes = 8; } else if (inat_is_vex3_prefix(attr)) { b2 = peek_nbyte_next(insn_byte_t, insn, 2); insn_set_byte(&insn->vex_prefix, 2, b2); insn->vex_prefix.nbytes = 3; insn->next_byte += 3; if (insn->x86_64 && X86_VEX_W(b2)) /* VEX.W overrides opnd_size */ insn->opnd_bytes = 8; } else { /* * For VEX2, fake VEX3-like byte#2. * Makes it easier to decode vex.W, vex.vvvv, * vex.L and vex.pp. Masking with 0x7f sets vex.W == 0. */ insn_set_byte(&insn->vex_prefix, 2, b2 & 0x7f); insn->vex_prefix.nbytes = 2; insn->next_byte += 2; } } vex_end: insn->vex_prefix.got = 1; prefixes->got = 1; return 0; err_out: return -ENODATA; } /** * insn_get_opcode - collect opcode(s) * @insn: &struct insn containing instruction * * Populates @insn->opcode, updates @insn->next_byte to point past the * opcode byte(s), and set @insn->attr (except for groups). * If necessary, first collects any preceding (prefix) bytes. * Sets @insn->opcode.value = opcode1. No effect if @insn->opcode.got * is already 1. 
* * Returns: * 0: on success * < 0: on error */ int insn_get_opcode(struct insn *insn) { struct insn_field *opcode = &insn->opcode; int pfx_id, ret; insn_byte_t op; if (opcode->got) return 0; if (!insn->prefixes.got) { ret = insn_get_prefixes(insn); if (ret) return ret; } /* Get first opcode */ op = get_next(insn_byte_t, insn); insn_set_byte(opcode, 0, op); opcode->nbytes = 1; /* Check if there is VEX prefix or not */ if (insn_is_avx(insn)) { insn_byte_t m, p; m = insn_vex_m_bits(insn); p = insn_vex_p_bits(insn); insn->attr = inat_get_avx_attribute(op, m, p); if ((inat_must_evex(insn->attr) && !insn_is_evex(insn)) || (!inat_accept_vex(insn->attr) && !inat_is_group(insn->attr))) { /* This instruction is bad */ insn->attr = 0; return -EINVAL; } /* VEX has only 1 byte for opcode */ goto end; } insn->attr = inat_get_opcode_attribute(op); while (inat_is_escape(insn->attr)) { /* Get escaped opcode */ op = get_next(insn_byte_t, insn); opcode->bytes[opcode->nbytes++] = op; pfx_id = insn_last_prefix_id(insn); insn->attr = inat_get_escape_attribute(op, pfx_id, insn->attr); } if (inat_must_vex(insn->attr)) { /* This instruction is bad */ insn->attr = 0; return -EINVAL; } end: opcode->got = 1; return 0; err_out: return -ENODATA; } /** * insn_get_modrm - collect ModRM byte, if any * @insn: &struct insn containing instruction * * Populates @insn->modrm and updates @insn->next_byte to point past the * ModRM byte, if any. If necessary, first collects the preceding bytes * (prefixes and opcode(s)). No effect if @insn->modrm.got is already 1. * * Returns: * 0: on success * < 0: on error */ int insn_get_modrm(struct insn *insn) { struct insn_field *modrm = &insn->modrm; insn_byte_t pfx_id, mod; int ret; if (modrm->got) return 0; if (!insn->opcode.got) { ret = insn_get_opcode(insn); if (ret) return ret; } if (inat_has_modrm(insn->attr)) { mod = get_next(insn_byte_t, insn); insn_field_set(modrm, mod, 1); if (inat_is_group(insn->attr)) { pfx_id = insn_last_prefix_id(insn); insn->attr = inat_get_group_attribute(mod, pfx_id, insn->attr); if (insn_is_avx(insn) && !inat_accept_vex(insn->attr)) { /* Bad insn */ insn->attr = 0; return -EINVAL; } } } if (insn->x86_64 && inat_is_force64(insn->attr)) insn->opnd_bytes = 8; modrm->got = 1; return 0; err_out: return -ENODATA; } /** * insn_rip_relative() - Does instruction use RIP-relative addressing mode? * @insn: &struct insn containing instruction * * If necessary, first collects the instruction up to and including the * ModRM byte. No effect if @insn->x86_64 is 0. */ int insn_rip_relative(struct insn *insn) { struct insn_field *modrm = &insn->modrm; int ret; if (!insn->x86_64) return 0; if (!modrm->got) { ret = insn_get_modrm(insn); if (ret) return 0; } /* * For rip-relative instructions, the mod field (top 2 bits) * is zero and the r/m field (bottom 3 bits) is 0x5. */ return (modrm->nbytes && (modrm->bytes[0] & 0xc7) == 0x5); } /** * insn_get_sib() - Get the SIB byte of instruction * @insn: &struct insn containing instruction * * If necessary, first collects the instruction up to and including the * ModRM byte. * * Returns: * 0: if decoding succeeded * < 0: otherwise. 
*/ int insn_get_sib(struct insn *insn) { insn_byte_t modrm; int ret; if (insn->sib.got) return 0; if (!insn->modrm.got) { ret = insn_get_modrm(insn); if (ret) return ret; } if (insn->modrm.nbytes) { modrm = insn->modrm.bytes[0]; if (insn->addr_bytes != 2 && X86_MODRM_MOD(modrm) != 3 && X86_MODRM_RM(modrm) == 4) { insn_field_set(&insn->sib, get_next(insn_byte_t, insn), 1); } } insn->sib.got = 1; return 0; err_out: return -ENODATA; } /** * insn_get_displacement() - Get the displacement of instruction * @insn: &struct insn containing instruction * * If necessary, first collects the instruction up to and including the * SIB byte. * Displacement value is sign-expanded. * * * Returns: * 0: if decoding succeeded * < 0: otherwise. */ int insn_get_displacement(struct insn *insn) { insn_byte_t mod, rm, base; int ret; if (insn->displacement.got) return 0; if (!insn->sib.got) { ret = insn_get_sib(insn); if (ret) return ret; } if (insn->modrm.nbytes) { /* * Interpreting the modrm byte: * mod = 00 - no displacement fields (exceptions below) * mod = 01 - 1-byte displacement field * mod = 10 - displacement field is 4 bytes, or 2 bytes if * address size = 2 (0x67 prefix in 32-bit mode) * mod = 11 - no memory operand * * If address size = 2... * mod = 00, r/m = 110 - displacement field is 2 bytes * * If address size != 2... * mod != 11, r/m = 100 - SIB byte exists * mod = 00, SIB base = 101 - displacement field is 4 bytes * mod = 00, r/m = 101 - rip-relative addressing, displacement * field is 4 bytes */ mod = X86_MODRM_MOD(insn->modrm.value); rm = X86_MODRM_RM(insn->modrm.value); base = X86_SIB_BASE(insn->sib.value); if (mod == 3) goto out; if (mod == 1) { insn_field_set(&insn->displacement, get_next(signed char, insn), 1); } else if (insn->addr_bytes == 2) { if ((mod == 0 && rm == 6) || mod == 2) { insn_field_set(&insn->displacement, get_next(short, insn), 2); } } else { if ((mod == 0 && rm == 5) || mod == 2 || (mod == 0 && base == 5)) { insn_field_set(&insn->displacement, get_next(int, insn), 4); } } } out: insn->displacement.got = 1; return 0; err_out: return -ENODATA; } /* Decode moffset16/32/64. Return 0 if failed */ static int __get_moffset(struct insn *insn) { switch (insn->addr_bytes) { case 2: insn_field_set(&insn->moffset1, get_next(short, insn), 2); break; case 4: insn_field_set(&insn->moffset1, get_next(int, insn), 4); break; case 8: insn_field_set(&insn->moffset1, get_next(int, insn), 4); insn_field_set(&insn->moffset2, get_next(int, insn), 4); break; default: /* opnd_bytes must be modified manually */ goto err_out; } insn->moffset1.got = insn->moffset2.got = 1; return 1; err_out: return 0; } /* Decode imm v32(Iz). 
Return 0 if failed */ static int __get_immv32(struct insn *insn) { switch (insn->opnd_bytes) { case 2: insn_field_set(&insn->immediate, get_next(short, insn), 2); break; case 4: case 8: insn_field_set(&insn->immediate, get_next(int, insn), 4); break; default: /* opnd_bytes must be modified manually */ goto err_out; } return 1; err_out: return 0; } /* Decode imm v64(Iv/Ov), Return 0 if failed */ static int __get_immv(struct insn *insn) { switch (insn->opnd_bytes) { case 2: insn_field_set(&insn->immediate1, get_next(short, insn), 2); break; case 4: insn_field_set(&insn->immediate1, get_next(int, insn), 4); insn->immediate1.nbytes = 4; break; case 8: insn_field_set(&insn->immediate1, get_next(int, insn), 4); insn_field_set(&insn->immediate2, get_next(int, insn), 4); break; default: /* opnd_bytes must be modified manually */ goto err_out; } insn->immediate1.got = insn->immediate2.got = 1; return 1; err_out: return 0; } /* Decode ptr16:16/32(Ap) */ static int __get_immptr(struct insn *insn) { switch (insn->opnd_bytes) { case 2: insn_field_set(&insn->immediate1, get_next(short, insn), 2); break; case 4: insn_field_set(&insn->immediate1, get_next(int, insn), 4); break; case 8: /* ptr16:64 is not exist (no segment) */ return 0; default: /* opnd_bytes must be modified manually */ goto err_out; } insn_field_set(&insn->immediate2, get_next(unsigned short, insn), 2); insn->immediate1.got = insn->immediate2.got = 1; return 1; err_out: return 0; } /** * insn_get_immediate() - Get the immediate in an instruction * @insn: &struct insn containing instruction * * If necessary, first collects the instruction up to and including the * displacement bytes. * Basically, most of immediates are sign-expanded. Unsigned-value can be * computed by bit masking with ((1 << (nbytes * 8)) - 1) * * Returns: * 0: on success * < 0: on error */ int insn_get_immediate(struct insn *insn) { int ret; if (insn->immediate.got) return 0; if (!insn->displacement.got) { ret = insn_get_displacement(insn); if (ret) return ret; } if (inat_has_moffset(insn->attr)) { if (!__get_moffset(insn)) goto err_out; goto done; } if (!inat_has_immediate(insn->attr)) /* no immediates */ goto done; switch (inat_immediate_size(insn->attr)) { case INAT_IMM_BYTE: insn_field_set(&insn->immediate, get_next(signed char, insn), 1); break; case INAT_IMM_WORD: insn_field_set(&insn->immediate, get_next(short, insn), 2); break; case INAT_IMM_DWORD: insn_field_set(&insn->immediate, get_next(int, insn), 4); break; case INAT_IMM_QWORD: insn_field_set(&insn->immediate1, get_next(int, insn), 4); insn_field_set(&insn->immediate2, get_next(int, insn), 4); break; case INAT_IMM_PTR: if (!__get_immptr(insn)) goto err_out; break; case INAT_IMM_VWORD32: if (!__get_immv32(insn)) goto err_out; break; case INAT_IMM_VWORD: if (!__get_immv(insn)) goto err_out; break; default: /* Here, insn must have an immediate, but failed */ goto err_out; } if (inat_has_second_immediate(insn->attr)) { insn_field_set(&insn->immediate2, get_next(signed char, insn), 1); } done: insn->immediate.got = 1; return 0; err_out: return -ENODATA; } /** * insn_get_length() - Get the length of instruction * @insn: &struct insn containing instruction * * If necessary, first collects the instruction up to and including the * immediates bytes. 
* * Returns: * - 0 on success * - < 0 on error */ int insn_get_length(struct insn *insn) { int ret; if (insn->length) return 0; if (!insn->immediate.got) { ret = insn_get_immediate(insn); if (ret) return ret; } insn->length = (unsigned char)((unsigned long)insn->next_byte - (unsigned long)insn->kaddr); return 0; } /* Ensure this instruction is decoded completely */ static inline int insn_complete(struct insn *insn) { return insn->opcode.got && insn->modrm.got && insn->sib.got && insn->displacement.got && insn->immediate.got; } /** * insn_decode() - Decode an x86 instruction * @insn: &struct insn to be initialized * @kaddr: address (in kernel memory) of instruction (or copy thereof) * @buf_len: length of the insn buffer at @kaddr * @m: insn mode, see enum insn_mode * * Returns: * 0: if decoding succeeded * < 0: otherwise. */ int insn_decode(struct insn *insn, const void *kaddr, int buf_len, enum insn_mode m) { int ret; /* #define INSN_MODE_KERN -1 __ignore_sync_check__ mode is only valid in the kernel */ if (m == INSN_MODE_KERN) insn_init(insn, kaddr, buf_len, IS_ENABLED(CONFIG_X86_64)); else insn_init(insn, kaddr, buf_len, m == INSN_MODE_64); ret = insn_get_length(insn); if (ret) return ret; if (insn_complete(insn)) return 0; return -EINVAL; }
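For context, here is a minimal caller sketch (not part of the file above) showing how the decoder entry point just defined is typically driven: insn_decode() runs the whole insn_get_*() chain and leaves the parsed fields in struct insn. The function name demo_decode_mov() and the hard-coded byte pattern are illustrative assumptions; only insn_decode(), struct insn and INSN_MODE_KERN come from the code above.

#include <linux/kernel.h>
#include <asm/insn.h>

/* Hypothetical example: decode "mov %rsp,%rbp" (48 89 e5) on an x86-64 kernel. */
static void demo_decode_mov(void)
{
        static const insn_byte_t buf[] = { 0x48, 0x89, 0xe5 };
        struct insn insn;
        int ret;

        /* INSN_MODE_KERN picks 32/64-bit mode from CONFIG_X86_64, as above. */
        ret = insn_decode(&insn, buf, sizeof(buf), INSN_MODE_KERN);
        if (ret < 0) {
                pr_err("insn_decode failed: %d\n", ret);
                return;
        }

        /* For this pattern: length == 3, opcode 0x89, REX.W forces opnd_bytes == 8. */
        pr_info("len=%d opcode=%#x modrm=%#x opnd_bytes=%d\n",
                insn.length, insn.opcode.bytes[0], insn.modrm.bytes[0],
                insn.opnd_bytes);
}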
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_PTRACE_H
#define _LINUX_PTRACE_H

#include <linux/compiler.h>		/* For unlikely. */
#include <linux/sched.h>		/* For struct task_struct. */
#include <linux/sched/signal.h>		/* For send_sig(), same_thread_group(), etc. */
#include <linux/err.h>			/* for IS_ERR_VALUE */
#include <linux/bug.h>			/* For BUG_ON. */
#include <linux/pid_namespace.h>	/* For task_active_pid_ns. */
#include <uapi/linux/ptrace.h>
#include <linux/seccomp.h>

/* Add sp to seccomp_data, as seccomp is user API, we don't want to modify it */
struct syscall_info {
	__u64			sp;
	struct seccomp_data	data;
};

extern int ptrace_access_vm(struct task_struct *tsk, unsigned long addr,
			    void *buf, int len, unsigned int gup_flags);

/*
 * Ptrace flags
 *
 * The ownership rule for task->ptrace, which holds the ptrace
 * flags, is simple: while a task is running it owns its own task->ptrace
 * flags; while the task is stopped, the ptracer owns task->ptrace.
*/ #define PT_SEIZED 0x00010000 /* SEIZE used, enable new behavior */ #define PT_PTRACED 0x00000001 #define PT_OPT_FLAG_SHIFT 3 /* PT_TRACE_* event enable flags */ #define PT_EVENT_FLAG(event) (1 << (PT_OPT_FLAG_SHIFT + (event))) #define PT_TRACESYSGOOD PT_EVENT_FLAG(0) #define PT_TRACE_FORK PT_EVENT_FLAG(PTRACE_EVENT_FORK) #define PT_TRACE_VFORK PT_EVENT_FLAG(PTRACE_EVENT_VFORK) #define PT_TRACE_CLONE PT_EVENT_FLAG(PTRACE_EVENT_CLONE) #define PT_TRACE_EXEC PT_EVENT_FLAG(PTRACE_EVENT_EXEC) #define PT_TRACE_VFORK_DONE PT_EVENT_FLAG(PTRACE_EVENT_VFORK_DONE) #define PT_TRACE_EXIT PT_EVENT_FLAG(PTRACE_EVENT_EXIT) #define PT_TRACE_SECCOMP PT_EVENT_FLAG(PTRACE_EVENT_SECCOMP) #define PT_EXITKILL (PTRACE_O_EXITKILL << PT_OPT_FLAG_SHIFT) #define PT_SUSPEND_SECCOMP (PTRACE_O_SUSPEND_SECCOMP << PT_OPT_FLAG_SHIFT) extern long arch_ptrace(struct task_struct *child, long request, unsigned long addr, unsigned long data); extern int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len); extern int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long dst, int len); extern void ptrace_disable(struct task_struct *); extern int ptrace_request(struct task_struct *child, long request, unsigned long addr, unsigned long data); extern void ptrace_notify(int exit_code); extern void __ptrace_link(struct task_struct *child, struct task_struct *new_parent, const struct cred *ptracer_cred); extern void __ptrace_unlink(struct task_struct *child); extern void exit_ptrace(struct task_struct *tracer, struct list_head *dead); #define PTRACE_MODE_READ 0x01 #define PTRACE_MODE_ATTACH 0x02 #define PTRACE_MODE_NOAUDIT 0x04 #define PTRACE_MODE_FSCREDS 0x08 #define PTRACE_MODE_REALCREDS 0x10 /* shorthands for READ/ATTACH and FSCREDS/REALCREDS combinations */ #define PTRACE_MODE_READ_FSCREDS (PTRACE_MODE_READ | PTRACE_MODE_FSCREDS) #define PTRACE_MODE_READ_REALCREDS (PTRACE_MODE_READ | PTRACE_MODE_REALCREDS) #define PTRACE_MODE_ATTACH_FSCREDS (PTRACE_MODE_ATTACH | PTRACE_MODE_FSCREDS) #define PTRACE_MODE_ATTACH_REALCREDS (PTRACE_MODE_ATTACH | PTRACE_MODE_REALCREDS) /** * ptrace_may_access - check whether the caller is permitted to access * a target task. * @task: target task * @mode: selects type of access and caller credentials * * Returns true on success, false on denial. * * One of the flags PTRACE_MODE_FSCREDS and PTRACE_MODE_REALCREDS must * be set in @mode to specify whether the access was requested through * a filesystem syscall (should use effective capabilities and fsuid * of the caller) or through an explicit syscall such as * process_vm_writev or ptrace (and should use the real credentials). */ extern bool ptrace_may_access(struct task_struct *task, unsigned int mode); static inline int ptrace_reparented(struct task_struct *child) { return !same_thread_group(child->real_parent, child->parent); } static inline void ptrace_unlink(struct task_struct *child) { if (unlikely(child->ptrace)) __ptrace_unlink(child); } int generic_ptrace_peekdata(struct task_struct *tsk, unsigned long addr, unsigned long data); int generic_ptrace_pokedata(struct task_struct *tsk, unsigned long addr, unsigned long data); /** * ptrace_parent - return the task that is tracing the given task * @task: task to consider * * Returns %NULL if no one is tracing @task, or the &struct task_struct * pointer to its tracer. * * Must called under rcu_read_lock(). The pointer returned might be kept * live only by RCU. 
During exec, this may be called with task_lock() held * on @task, still held from when check_unsafe_exec() was called. */ static inline struct task_struct *ptrace_parent(struct task_struct *task) { if (unlikely(task->ptrace)) return rcu_dereference(task->parent); return NULL; } /** * ptrace_event_enabled - test whether a ptrace event is enabled * @task: ptracee of interest * @event: %PTRACE_EVENT_* to test * * Test whether @event is enabled for ptracee @task. * * Returns %true if @event is enabled, %false otherwise. */ static inline bool ptrace_event_enabled(struct task_struct *task, int event) { return task->ptrace & PT_EVENT_FLAG(event); } /** * ptrace_event - possibly stop for a ptrace event notification * @event: %PTRACE_EVENT_* value to report * @message: value for %PTRACE_GETEVENTMSG to return * * Check whether @event is enabled and, if so, report @event and @message * to the ptrace parent. * * Called without locks. */ static inline void ptrace_event(int event, unsigned long message) { if (unlikely(ptrace_event_enabled(current, event))) { current->ptrace_message = message; ptrace_notify((event << 8) | SIGTRAP); } else if (event == PTRACE_EVENT_EXEC) { /* legacy EXEC report via SIGTRAP */ if ((current->ptrace & (PT_PTRACED|PT_SEIZED)) == PT_PTRACED) send_sig(SIGTRAP, current, 0); } } /** * ptrace_event_pid - possibly stop for a ptrace event notification * @event: %PTRACE_EVENT_* value to report * @pid: process identifier for %PTRACE_GETEVENTMSG to return * * Check whether @event is enabled and, if so, report @event and @pid * to the ptrace parent. @pid is reported as the pid_t seen from the * ptrace parent's pid namespace. * * Called without locks. */ static inline void ptrace_event_pid(int event, struct pid *pid) { /* * FIXME: There's a potential race if a ptracer in a different pid * namespace than parent attaches between computing message below and * when we acquire tasklist_lock in ptrace_stop(). If this happens, * the ptracer will get a bogus pid from PTRACE_GETEVENTMSG. */ unsigned long message = 0; struct pid_namespace *ns; rcu_read_lock(); ns = task_active_pid_ns(rcu_dereference(current->parent)); if (ns) message = pid_nr_ns(pid, ns); rcu_read_unlock(); ptrace_event(event, message); } /** * ptrace_init_task - initialize ptrace state for a new child * @child: new child task * @ptrace: true if child should be ptrace'd by parent's tracer * * This is called immediately after adding @child to its parent's children * list. @ptrace is false in the normal case, and true to ptrace @child. * * Called with current's siglock and write_lock_irq(&tasklist_lock) held. */ static inline void ptrace_init_task(struct task_struct *child, bool ptrace) { INIT_LIST_HEAD(&child->ptrace_entry); INIT_LIST_HEAD(&child->ptraced); child->jobctl = 0; child->ptrace = 0; child->parent = child->real_parent; if (unlikely(ptrace) && current->ptrace) { child->ptrace = current->ptrace; __ptrace_link(child, current->parent, current->ptracer_cred); if (child->ptrace & PT_SEIZED) task_set_jobctl_pending(child, JOBCTL_TRAP_STOP); else sigaddset(&child->pending.signal, SIGSTOP); } else child->ptracer_cred = NULL; } /** * ptrace_release_task - final ptrace-related cleanup of a zombie being reaped * @task: task in %EXIT_DEAD state * * Called with write_lock(&tasklist_lock) held. 
*/ static inline void ptrace_release_task(struct task_struct *task) { BUG_ON(!list_empty(&task->ptraced)); ptrace_unlink(task); BUG_ON(!list_empty(&task->ptrace_entry)); } #ifndef force_successful_syscall_return /* * System call handlers that, upon successful completion, need to return a * negative value should call force_successful_syscall_return() right before * returning. On architectures where the syscall convention provides for a * separate error flag (e.g., alpha, ia64, ppc{,64}, sparc{,64}, possibly * others), this macro can be used to ensure that the error flag will not get * set. On architectures which do not support a separate error flag, the macro * is a no-op and the spurious error condition needs to be filtered out by some * other means (e.g., in user-level, by passing an extra argument to the * syscall handler, or something along those lines). */ #define force_successful_syscall_return() do { } while (0) #endif #ifndef is_syscall_success /* * On most systems we can tell if a syscall is a success based on if the retval * is an error value. On some systems like ia64 and powerpc they have different * indicators of success/failure and must define their own. */ #define is_syscall_success(regs) (!IS_ERR_VALUE((unsigned long)(regs_return_value(regs)))) #endif /* * <asm/ptrace.h> should define the following things inside #ifdef __KERNEL__. * * These do-nothing inlines are used when the arch does not * implement single-step. The kerneldoc comments are here * to document the interface for all arch definitions. */ #ifndef arch_has_single_step /** * arch_has_single_step - does this CPU support user-mode single-step? * * If this is defined, then there must be function declarations or * inlines for user_enable_single_step() and user_disable_single_step(). * arch_has_single_step() should evaluate to nonzero iff the machine * supports instruction single-step for user mode. * It can be a constant or it can test a CPU feature bit. */ #define arch_has_single_step() (0) /** * user_enable_single_step - single-step in user-mode task * @task: either current or a task stopped in %TASK_TRACED * * This can only be called when arch_has_single_step() has returned nonzero. * Set @task so that when it returns to user mode, it will trap after the * next single instruction executes. If arch_has_block_step() is defined, * this must clear the effects of user_enable_block_step() too. */ static inline void user_enable_single_step(struct task_struct *task) { BUG(); /* This can never be called. */ } /** * user_disable_single_step - cancel user-mode single-step * @task: either current or a task stopped in %TASK_TRACED * * Clear @task of the effects of user_enable_single_step() and * user_enable_block_step(). This can be called whether or not either * of those was ever called on @task, and even if arch_has_single_step() * returned zero. */ static inline void user_disable_single_step(struct task_struct *task) { } #else extern void user_enable_single_step(struct task_struct *); extern void user_disable_single_step(struct task_struct *); #endif /* arch_has_single_step */ #ifndef arch_has_block_step /** * arch_has_block_step - does this CPU support user-mode block-step? * * If this is defined, then there must be a function declaration or inline * for user_enable_block_step(), and arch_has_single_step() must be defined * too. arch_has_block_step() should evaluate to nonzero iff the machine * supports step-until-branch for user mode. It can be a constant or it * can test a CPU feature bit. 
*/ #define arch_has_block_step() (0) /** * user_enable_block_step - step until branch in user-mode task * @task: either current or a task stopped in %TASK_TRACED * * This can only be called when arch_has_block_step() has returned nonzero, * and will never be called when single-instruction stepping is being used. * Set @task so that when it returns to user mode, it will trap after the * next branch or trap taken. */ static inline void user_enable_block_step(struct task_struct *task) { BUG(); /* This can never be called. */ } #else extern void user_enable_block_step(struct task_struct *); #endif /* arch_has_block_step */ #ifdef ARCH_HAS_USER_SINGLE_STEP_REPORT extern void user_single_step_report(struct pt_regs *regs); #else static inline void user_single_step_report(struct pt_regs *regs) { kernel_siginfo_t info; clear_siginfo(&info); info.si_signo = SIGTRAP; info.si_errno = 0; info.si_code = SI_USER; info.si_pid = 0; info.si_uid = 0; force_sig_info(&info); } #endif #ifndef arch_ptrace_stop_needed /** * arch_ptrace_stop_needed - Decide whether arch_ptrace_stop() should be called * @code: current->exit_code value ptrace will stop with * @info: siginfo_t pointer (or %NULL) for signal ptrace will stop with * * This is called with the siglock held, to decide whether or not it's * necessary to release the siglock and call arch_ptrace_stop() with the * same @code and @info arguments. It can be defined to a constant if * arch_ptrace_stop() is never required, or always is. On machines where * this makes sense, it should be defined to a quick test to optimize out * calling arch_ptrace_stop() when it would be superfluous. For example, * if the thread has not been back to user mode since the last stop, the * thread state might indicate that nothing needs to be done. * * This is guaranteed to be invoked once before a task stops for ptrace and * may include arch-specific operations necessary prior to a ptrace stop. */ #define arch_ptrace_stop_needed(code, info) (0) #endif #ifndef arch_ptrace_stop /** * arch_ptrace_stop - Do machine-specific work before stopping for ptrace * @code: current->exit_code value ptrace will stop with * @info: siginfo_t pointer (or %NULL) for signal ptrace will stop with * * This is called with no locks held when arch_ptrace_stop_needed() has * just returned nonzero. It is allowed to block, e.g. for user memory * access. The arch can have machine-specific work to be done before * ptrace stops. On ia64, register backing store gets written back to user * memory here. Since this can be costly (requires dropping the siglock), * we only do it when the arch requires it for this particular stop, as * indicated by arch_ptrace_stop_needed(). */ #define arch_ptrace_stop(code, info) do { } while (0) #endif #ifndef current_pt_regs #define current_pt_regs() task_pt_regs(current) #endif /* * unlike current_pt_regs(), this one is equal to task_pt_regs(current) * on *all* architectures; the only reason to have a per-arch definition * is optimisation. */ #ifndef signal_pt_regs #define signal_pt_regs() task_pt_regs(current) #endif #ifndef current_user_stack_pointer #define current_user_stack_pointer() user_stack_pointer(current_pt_regs()) #endif extern int task_current_syscall(struct task_struct *target, struct syscall_info *info); extern void sigaction_compat_abi(struct k_sigaction *act, struct k_sigaction *oact); #endif
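As a pointer to how these definitions fit together: task->ptrace stores the tracer's PTRACE_O_* options shifted left by PT_OPT_FLAG_SHIFT, so PT_EVENT_FLAG(event) lines up with the option bit that enables that event, and ptrace_event_enabled()/ptrace_event() test exactly that bit. A hedged call-site sketch follows; the function name example_report_exit() and its exit_code parameter are made up for illustration, while the helpers and constants come from the header above.

#include <linux/ptrace.h>

/*
 * Illustrative only: report PTRACE_EVENT_EXIT to the tracer. ptrace_event()
 * already checks ptrace_event_enabled() internally; the explicit test here
 * only shows where the PT_TRACE_EXIT bit (PTRACE_O_TRACEEXIT shifted by
 * PT_OPT_FLAG_SHIFT) is consulted.
 */
static void example_report_exit(int exit_code)
{
        if (ptrace_event_enabled(current, PTRACE_EVENT_EXIT))
                ptrace_event(PTRACE_EVENT_EXIT, exit_code);
}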
// SPDX-License-Identifier: GPL-2.0
/*
 * Common Block IO controller cgroup interface
 *
 * Based on ideas and code from CFQ, CFS and BFQ:
 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
 *
 * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
 *		      Paolo Valente <paolo.valente@unimore.it>
 *
 * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
 *		      Nauman Rafique <nauman@google.com>
 *
 * For policy-specific per-blkcg data:
 * Copyright (C) 2015 Paolo Valente <paolo.valente@unimore.it>
 *                    Arianna Avanzini <avanzini.arianna@gmail.com>
 */
#include <linux/ioprio.h>
#include <linux/kdev_t.h>
#include <linux/module.h>
#include <linux/sched/signal.h>
#include <linux/err.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/slab.h>
#include <linux/genhd.h>
#include <linux/delay.h>
#include <linux/atomic.h>
#include <linux/ctype.h>
#include <linux/blk-cgroup.h>
#include <linux/tracehook.h>
#include <linux/psi.h>
#include "blk.h"
#include "blk-ioprio.h"

/*
 * blkcg_pol_mutex protects blkcg_policy[] and policy [de]activation.
 * blkcg_pol_register_mutex nests outside of it and synchronizes entire
 * policy [un]register operations including cgroup file additions /
 * removals.  Putting cgroup file registration outside blkcg_pol_mutex
 * allows grabbing it from cgroup callbacks.
*/ static DEFINE_MUTEX(blkcg_pol_register_mutex); static DEFINE_MUTEX(blkcg_pol_mutex); struct blkcg blkcg_root; EXPORT_SYMBOL_GPL(blkcg_root); struct cgroup_subsys_state * const blkcg_root_css = &blkcg_root.css; EXPORT_SYMBOL_GPL(blkcg_root_css); static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS]; static LIST_HEAD(all_blkcgs); /* protected by blkcg_pol_mutex */ bool blkcg_debug_stats = false; static struct workqueue_struct *blkcg_punt_bio_wq; #define BLKG_DESTROY_BATCH_SIZE 64 static bool blkcg_policy_enabled(struct request_queue *q, const struct blkcg_policy *pol) { return pol && test_bit(pol->plid, q->blkcg_pols); } /** * blkg_free - free a blkg * @blkg: blkg to free * * Free @blkg which may be partially allocated. */ static void blkg_free(struct blkcg_gq *blkg) { int i; if (!blkg) return; for (i = 0; i < BLKCG_MAX_POLS; i++) if (blkg->pd[i]) blkcg_policy[i]->pd_free_fn(blkg->pd[i]); free_percpu(blkg->iostat_cpu); percpu_ref_exit(&blkg->refcnt); kfree(blkg); } static void __blkg_release(struct rcu_head *rcu) { struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head); WARN_ON(!bio_list_empty(&blkg->async_bios)); /* release the blkcg and parent blkg refs this blkg has been holding */ css_put(&blkg->blkcg->css); if (blkg->parent) blkg_put(blkg->parent); blkg_free(blkg); } /* * A group is RCU protected, but having an rcu lock does not mean that one * can access all the fields of blkg and assume these are valid. For * example, don't try to follow throtl_data and request queue links. * * Having a reference to blkg under an rcu allows accesses to only values * local to groups like group stats and group rate limits. */ static void blkg_release(struct percpu_ref *ref) { struct blkcg_gq *blkg = container_of(ref, struct blkcg_gq, refcnt); call_rcu(&blkg->rcu_head, __blkg_release); } static void blkg_async_bio_workfn(struct work_struct *work) { struct blkcg_gq *blkg = container_of(work, struct blkcg_gq, async_bio_work); struct bio_list bios = BIO_EMPTY_LIST; struct bio *bio; struct blk_plug plug; bool need_plug = false; /* as long as there are pending bios, @blkg can't go away */ spin_lock_bh(&blkg->async_bio_lock); bio_list_merge(&bios, &blkg->async_bios); bio_list_init(&blkg->async_bios); spin_unlock_bh(&blkg->async_bio_lock); /* start plug only when bio_list contains at least 2 bios */ if (bios.head && bios.head->bi_next) { need_plug = true; blk_start_plug(&plug); } while ((bio = bio_list_pop(&bios))) submit_bio(bio); if (need_plug) blk_finish_plug(&plug); } /** * blkg_alloc - allocate a blkg * @blkcg: block cgroup the new blkg is associated with * @q: request_queue the new blkg is associated with * @gfp_mask: allocation mask to use * * Allocate a new blkg assocating @blkcg and @q. 
*/ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q, gfp_t gfp_mask) { struct blkcg_gq *blkg; int i, cpu; /* alloc and init base part */ blkg = kzalloc_node(sizeof(*blkg), gfp_mask, q->node); if (!blkg) return NULL; if (percpu_ref_init(&blkg->refcnt, blkg_release, 0, gfp_mask)) goto err_free; blkg->iostat_cpu = alloc_percpu_gfp(struct blkg_iostat_set, gfp_mask); if (!blkg->iostat_cpu) goto err_free; blkg->q = q; INIT_LIST_HEAD(&blkg->q_node); spin_lock_init(&blkg->async_bio_lock); bio_list_init(&blkg->async_bios); INIT_WORK(&blkg->async_bio_work, blkg_async_bio_workfn); blkg->blkcg = blkcg; u64_stats_init(&blkg->iostat.sync); for_each_possible_cpu(cpu) u64_stats_init(&per_cpu_ptr(blkg->iostat_cpu, cpu)->sync); for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; struct blkg_policy_data *pd; if (!blkcg_policy_enabled(q, pol)) continue; /* alloc per-policy data and attach it to blkg */ pd = pol->pd_alloc_fn(gfp_mask, q, blkcg); if (!pd) goto err_free; blkg->pd[i] = pd; pd->blkg = blkg; pd->plid = i; } return blkg; err_free: blkg_free(blkg); return NULL; } struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg, struct request_queue *q, bool update_hint) { struct blkcg_gq *blkg; /* * Hint didn't match. Look up from the radix tree. Note that the * hint can only be updated under queue_lock as otherwise @blkg * could have already been removed from blkg_tree. The caller is * responsible for grabbing queue_lock if @update_hint. */ blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id); if (blkg && blkg->q == q) { if (update_hint) { lockdep_assert_held(&q->queue_lock); rcu_assign_pointer(blkcg->blkg_hint, blkg); } return blkg; } return NULL; } EXPORT_SYMBOL_GPL(blkg_lookup_slowpath); /* * If @new_blkg is %NULL, this function tries to allocate a new one as * necessary using %GFP_NOWAIT. @new_blkg is always consumed on return. 
*/ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct request_queue *q, struct blkcg_gq *new_blkg) { struct blkcg_gq *blkg; int i, ret; WARN_ON_ONCE(!rcu_read_lock_held()); lockdep_assert_held(&q->queue_lock); /* request_queue is dying, do not create/recreate a blkg */ if (blk_queue_dying(q)) { ret = -ENODEV; goto err_free_blkg; } /* blkg holds a reference to blkcg */ if (!css_tryget_online(&blkcg->css)) { ret = -ENODEV; goto err_free_blkg; } /* allocate */ if (!new_blkg) { new_blkg = blkg_alloc(blkcg, q, GFP_NOWAIT | __GFP_NOWARN); if (unlikely(!new_blkg)) { ret = -ENOMEM; goto err_put_css; } } blkg = new_blkg; /* link parent */ if (blkcg_parent(blkcg)) { blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false); if (WARN_ON_ONCE(!blkg->parent)) { ret = -ENODEV; goto err_put_css; } blkg_get(blkg->parent); } /* invoke per-policy init */ for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; if (blkg->pd[i] && pol->pd_init_fn) pol->pd_init_fn(blkg->pd[i]); } /* insert */ spin_lock(&blkcg->lock); ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg); if (likely(!ret)) { hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list); list_add(&blkg->q_node, &q->blkg_list); for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; if (blkg->pd[i] && pol->pd_online_fn) pol->pd_online_fn(blkg->pd[i]); } } blkg->online = true; spin_unlock(&blkcg->lock); if (!ret) return blkg; /* @blkg failed fully initialized, use the usual release path */ blkg_put(blkg); return ERR_PTR(ret); err_put_css: css_put(&blkcg->css); err_free_blkg: blkg_free(new_blkg); return ERR_PTR(ret); } /** * blkg_lookup_create - lookup blkg, try to create one if not there * @blkcg: blkcg of interest * @q: request_queue of interest * * Lookup blkg for the @blkcg - @q pair. If it doesn't exist, try to * create one. blkg creation is performed recursively from blkcg_root such * that all non-root blkg's have access to the parent blkg. This function * should be called under RCU read lock and takes @q->queue_lock. * * Returns the blkg or the closest blkg if blkg_create() fails as it walks * down from root. */ static struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, struct request_queue *q) { struct blkcg_gq *blkg; unsigned long flags; WARN_ON_ONCE(!rcu_read_lock_held()); blkg = blkg_lookup(blkcg, q); if (blkg) return blkg; spin_lock_irqsave(&q->queue_lock, flags); blkg = __blkg_lookup(blkcg, q, true); if (blkg) goto found; /* * Create blkgs walking down from blkcg_root to @blkcg, so that all * non-root blkgs have access to their parents. Returns the closest * blkg to the intended blkg should blkg_create() fail. 
*/ while (true) { struct blkcg *pos = blkcg; struct blkcg *parent = blkcg_parent(blkcg); struct blkcg_gq *ret_blkg = q->root_blkg; while (parent) { blkg = __blkg_lookup(parent, q, false); if (blkg) { /* remember closest blkg */ ret_blkg = blkg; break; } pos = parent; parent = blkcg_parent(parent); } blkg = blkg_create(pos, q, NULL); if (IS_ERR(blkg)) { blkg = ret_blkg; break; } if (pos == blkcg) break; } found: spin_unlock_irqrestore(&q->queue_lock, flags); return blkg; } static void blkg_destroy(struct blkcg_gq *blkg) { struct blkcg *blkcg = blkg->blkcg; int i; lockdep_assert_held(&blkg->q->queue_lock); lockdep_assert_held(&blkcg->lock); /* Something wrong if we are trying to remove same group twice */ WARN_ON_ONCE(list_empty(&blkg->q_node)); WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node)); for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; if (blkg->pd[i] && pol->pd_offline_fn) pol->pd_offline_fn(blkg->pd[i]); } blkg->online = false; radix_tree_delete(&blkcg->blkg_tree, blkg->q->id); list_del_init(&blkg->q_node); hlist_del_init_rcu(&blkg->blkcg_node); /* * Both setting lookup hint to and clearing it from @blkg are done * under queue_lock. If it's not pointing to @blkg now, it never * will. Hint assignment itself can race safely. */ if (rcu_access_pointer(blkcg->blkg_hint) == blkg) rcu_assign_pointer(blkcg->blkg_hint, NULL); /* * Put the reference taken at the time of creation so that when all * queues are gone, group can be destroyed. */ percpu_ref_kill(&blkg->refcnt); } /** * blkg_destroy_all - destroy all blkgs associated with a request_queue * @q: request_queue of interest * * Destroy all blkgs associated with @q. */ static void blkg_destroy_all(struct request_queue *q) { struct blkcg_gq *blkg, *n; int count = BLKG_DESTROY_BATCH_SIZE; int i; restart: spin_lock_irq(&q->queue_lock); list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) { struct blkcg *blkcg = blkg->blkcg; spin_lock(&blkcg->lock); blkg_destroy(blkg); spin_unlock(&blkcg->lock); /* * in order to avoid holding the spin lock for too long, release * it when a batch of blkgs are destroyed. */ if (!(--count)) { count = BLKG_DESTROY_BATCH_SIZE; spin_unlock_irq(&q->queue_lock); cond_resched(); goto restart; } } /* * Mark policy deactivated since policy offline has been done, and * the free is scheduled, so future blkcg_deactivate_policy() can * be bypassed */ for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; if (pol) __clear_bit(pol->plid, q->blkcg_pols); } q->root_blkg = NULL; spin_unlock_irq(&q->queue_lock); } static int blkcg_reset_stats(struct cgroup_subsys_state *css, struct cftype *cftype, u64 val) { struct blkcg *blkcg = css_to_blkcg(css); struct blkcg_gq *blkg; int i, cpu; mutex_lock(&blkcg_pol_mutex); spin_lock_irq(&blkcg->lock); /* * Note that stat reset is racy - it doesn't synchronize against * stat updates. This is a debug feature which shouldn't exist * anyway. If you get hit by a race, retry. 
*/ hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { for_each_possible_cpu(cpu) { struct blkg_iostat_set *bis = per_cpu_ptr(blkg->iostat_cpu, cpu); memset(bis, 0, sizeof(*bis)); } memset(&blkg->iostat, 0, sizeof(blkg->iostat)); for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; if (blkg->pd[i] && pol->pd_reset_stats_fn) pol->pd_reset_stats_fn(blkg->pd[i]); } } spin_unlock_irq(&blkcg->lock); mutex_unlock(&blkcg_pol_mutex); return 0; } const char *blkg_dev_name(struct blkcg_gq *blkg) { if (!blkg->q->disk || !blkg->q->disk->bdi->dev) return NULL; return bdi_dev_name(blkg->q->disk->bdi); } /** * blkcg_print_blkgs - helper for printing per-blkg data * @sf: seq_file to print to * @blkcg: blkcg of interest * @prfill: fill function to print out a blkg * @pol: policy in question * @data: data to be passed to @prfill * @show_total: to print out sum of prfill return values or not * * This function invokes @prfill on each blkg of @blkcg if pd for the * policy specified by @pol exists. @prfill is invoked with @sf, the * policy data and @data and the matching queue lock held. If @show_total * is %true, the sum of the return values from @prfill is printed with * "Total" label at the end. * * This is to be used to construct print functions for * cftype->read_seq_string method. */ void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg, u64 (*prfill)(struct seq_file *, struct blkg_policy_data *, int), const struct blkcg_policy *pol, int data, bool show_total) { struct blkcg_gq *blkg; u64 total = 0; rcu_read_lock(); hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { spin_lock_irq(&blkg->q->queue_lock); if (blkcg_policy_enabled(blkg->q, pol)) total += prfill(sf, blkg->pd[pol->plid], data); spin_unlock_irq(&blkg->q->queue_lock); } rcu_read_unlock(); if (show_total) seq_printf(sf, "Total %llu\n", (unsigned long long)total); } EXPORT_SYMBOL_GPL(blkcg_print_blkgs); /** * __blkg_prfill_u64 - prfill helper for a single u64 value * @sf: seq_file to print to * @pd: policy private data of interest * @v: value to print * * Print @v to @sf for the device assocaited with @pd. */ u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v) { const char *dname = blkg_dev_name(pd->blkg); if (!dname) return 0; seq_printf(sf, "%s %llu\n", dname, (unsigned long long)v); return v; } EXPORT_SYMBOL_GPL(__blkg_prfill_u64); /* Performs queue bypass and policy enabled checks then looks up blkg. */ static struct blkcg_gq *blkg_lookup_check(struct blkcg *blkcg, const struct blkcg_policy *pol, struct request_queue *q) { WARN_ON_ONCE(!rcu_read_lock_held()); lockdep_assert_held(&q->queue_lock); if (!blkcg_policy_enabled(q, pol)) return ERR_PTR(-EOPNOTSUPP); return __blkg_lookup(blkcg, q, true /* update_hint */); } /** * blkcg_conf_open_bdev - parse and open bdev for per-blkg config update * @inputp: input string pointer * * Parse the device node prefix part, MAJ:MIN, of per-blkg config update * from @input and get and return the matching bdev. *@inputp is * updated to point past the device node prefix. Returns an ERR_PTR() * value on error. * * Use this function iff blkg_conf_prep() can't be used for some reason. 
*/ struct block_device *blkcg_conf_open_bdev(char **inputp) { char *input = *inputp; unsigned int major, minor; struct block_device *bdev; int key_len; if (sscanf(input, "%u:%u%n", &major, &minor, &key_len) != 2) return ERR_PTR(-EINVAL); input += key_len; if (!isspace(*input)) return ERR_PTR(-EINVAL); input = skip_spaces(input); bdev = blkdev_get_no_open(MKDEV(major, minor)); if (!bdev) return ERR_PTR(-ENODEV); if (bdev_is_partition(bdev)) { blkdev_put_no_open(bdev); return ERR_PTR(-ENODEV); } *inputp = input; return bdev; } /** * blkg_conf_prep - parse and prepare for per-blkg config update * @blkcg: target block cgroup * @pol: target policy * @input: input string * @ctx: blkg_conf_ctx to be filled * * Parse per-blkg config update from @input and initialize @ctx with the * result. @ctx->blkg points to the blkg to be updated and @ctx->body the * part of @input following MAJ:MIN. This function returns with RCU read * lock and queue lock held and must be paired with blkg_conf_finish(). */ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, char *input, struct blkg_conf_ctx *ctx) __acquires(rcu) __acquires(&bdev->bd_disk->queue->queue_lock) { struct block_device *bdev; struct request_queue *q; struct blkcg_gq *blkg; int ret; bdev = blkcg_conf_open_bdev(&input); if (IS_ERR(bdev)) return PTR_ERR(bdev); q = bdev->bd_disk->queue; /* * blkcg_deactivate_policy() requires queue to be frozen, we can grab * q_usage_counter to prevent concurrent with blkcg_deactivate_policy(). */ ret = blk_queue_enter(q, 0); if (ret) goto fail; rcu_read_lock(); spin_lock_irq(&q->queue_lock); blkg = blkg_lookup_check(blkcg, pol, q); if (IS_ERR(blkg)) { ret = PTR_ERR(blkg); goto fail_unlock; } if (blkg) goto success; /* * Create blkgs walking down from blkcg_root to @blkcg, so that all * non-root blkgs have access to their parents. */ while (true) { struct blkcg *pos = blkcg; struct blkcg *parent; struct blkcg_gq *new_blkg; parent = blkcg_parent(blkcg); while (parent && !__blkg_lookup(parent, q, false)) { pos = parent; parent = blkcg_parent(parent); } /* Drop locks to do new blkg allocation with GFP_KERNEL. */ spin_unlock_irq(&q->queue_lock); rcu_read_unlock(); new_blkg = blkg_alloc(pos, q, GFP_KERNEL); if (unlikely(!new_blkg)) { ret = -ENOMEM; goto fail_exit_queue; } if (radix_tree_preload(GFP_KERNEL)) { blkg_free(new_blkg); ret = -ENOMEM; goto fail_exit_queue; } rcu_read_lock(); spin_lock_irq(&q->queue_lock); blkg = blkg_lookup_check(pos, pol, q); if (IS_ERR(blkg)) { ret = PTR_ERR(blkg); blkg_free(new_blkg); goto fail_preloaded; } if (blkg) { blkg_free(new_blkg); } else { blkg = blkg_create(pos, q, new_blkg); if (IS_ERR(blkg)) { ret = PTR_ERR(blkg); goto fail_preloaded; } } radix_tree_preload_end(); if (pos == blkcg) goto success; } success: blk_queue_exit(q); ctx->bdev = bdev; ctx->blkg = blkg; ctx->body = input; return 0; fail_preloaded: radix_tree_preload_end(); fail_unlock: spin_unlock_irq(&q->queue_lock); rcu_read_unlock(); fail_exit_queue: blk_queue_exit(q); fail: blkdev_put_no_open(bdev); /* * If queue was bypassing, we should retry. Do so after a * short msleep(). It isn't strictly necessary but queue * can be bypassing for some time and it's always nice to * avoid busy looping. */ if (ret == -EBUSY) { msleep(10); ret = restart_syscall(); } return ret; } EXPORT_SYMBOL_GPL(blkg_conf_prep); /** * blkg_conf_finish - finish up per-blkg config update * @ctx: blkg_conf_ctx intiailized by blkg_conf_prep() * * Finish up after per-blkg config update. 
This function must be paired * with blkg_conf_prep(). */ void blkg_conf_finish(struct blkg_conf_ctx *ctx) __releases(&ctx->bdev->bd_disk->queue->queue_lock) __releases(rcu) { spin_unlock_irq(&ctx->bdev->bd_disk->queue->queue_lock); rcu_read_unlock(); blkdev_put_no_open(ctx->bdev); } EXPORT_SYMBOL_GPL(blkg_conf_finish); static void blkg_iostat_set(struct blkg_iostat *dst, struct blkg_iostat *src) { int i; for (i = 0; i < BLKG_IOSTAT_NR; i++) { dst->bytes[i] = src->bytes[i]; dst->ios[i] = src->ios[i]; } } static void blkg_iostat_add(struct blkg_iostat *dst, struct blkg_iostat *src) { int i; for (i = 0; i < BLKG_IOSTAT_NR; i++) { dst->bytes[i] += src->bytes[i]; dst->ios[i] += src->ios[i]; } } static void blkg_iostat_sub(struct blkg_iostat *dst, struct blkg_iostat *src) { int i; for (i = 0; i < BLKG_IOSTAT_NR; i++) { dst->bytes[i] -= src->bytes[i]; dst->ios[i] -= src->ios[i]; } } static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu) { struct blkcg *blkcg = css_to_blkcg(css); struct blkcg_gq *blkg; /* Root-level stats are sourced from system-wide IO stats */ if (!cgroup_parent(css->cgroup)) return; rcu_read_lock(); hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { struct blkcg_gq *parent = blkg->parent; struct blkg_iostat_set *bisc = per_cpu_ptr(blkg->iostat_cpu, cpu); struct blkg_iostat cur, delta; unsigned long flags; unsigned int seq; /* fetch the current per-cpu values */ do { seq = u64_stats_fetch_begin(&bisc->sync); blkg_iostat_set(&cur, &bisc->cur); } while (u64_stats_fetch_retry(&bisc->sync, seq)); /* propagate percpu delta to global */ flags = u64_stats_update_begin_irqsave(&blkg->iostat.sync); blkg_iostat_set(&delta, &cur); blkg_iostat_sub(&delta, &bisc->last); blkg_iostat_add(&blkg->iostat.cur, &delta); blkg_iostat_add(&bisc->last, &delta); u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags); /* propagate global delta to parent (unless that's root) */ if (parent && parent->parent) { flags = u64_stats_update_begin_irqsave(&parent->iostat.sync); blkg_iostat_set(&delta, &blkg->iostat.cur); blkg_iostat_sub(&delta, &blkg->iostat.last); blkg_iostat_add(&parent->iostat.cur, &delta); blkg_iostat_add(&blkg->iostat.last, &delta); u64_stats_update_end_irqrestore(&parent->iostat.sync, flags); } } rcu_read_unlock(); } /* * We source root cgroup stats from the system-wide stats to avoid * tracking the same information twice and incurring overhead when no * cgroups are defined. For that reason, cgroup_rstat_flush in * blkcg_print_stat does not actually fill out the iostat in the root * cgroup's blkcg_gq. * * However, we would like to re-use the printing code between the root and * non-root cgroups to the extent possible. For that reason, we simulate * flushing the root cgroup's stats by explicitly filling in the iostat * with disk level statistics. 
*/ static void blkcg_fill_root_iostats(void) { struct class_dev_iter iter; struct device *dev; class_dev_iter_init(&iter, &block_class, NULL, &disk_type); while ((dev = class_dev_iter_next(&iter))) { struct block_device *bdev = dev_to_bdev(dev); struct blkcg_gq *blkg = blk_queue_root_blkg(bdev->bd_disk->queue); struct blkg_iostat tmp; int cpu; unsigned long flags; memset(&tmp, 0, sizeof(tmp)); for_each_possible_cpu(cpu) { struct disk_stats *cpu_dkstats; cpu_dkstats = per_cpu_ptr(bdev->bd_stats, cpu); tmp.ios[BLKG_IOSTAT_READ] += cpu_dkstats->ios[STAT_READ]; tmp.ios[BLKG_IOSTAT_WRITE] += cpu_dkstats->ios[STAT_WRITE]; tmp.ios[BLKG_IOSTAT_DISCARD] += cpu_dkstats->ios[STAT_DISCARD]; // convert sectors to bytes tmp.bytes[BLKG_IOSTAT_READ] += cpu_dkstats->sectors[STAT_READ] << 9; tmp.bytes[BLKG_IOSTAT_WRITE] += cpu_dkstats->sectors[STAT_WRITE] << 9; tmp.bytes[BLKG_IOSTAT_DISCARD] += cpu_dkstats->sectors[STAT_DISCARD] << 9; } flags = u64_stats_update_begin_irqsave(&blkg->iostat.sync); blkg_iostat_set(&blkg->iostat.cur, &tmp); u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags); } class_dev_iter_exit(&iter); } static void blkcg_print_one_stat(struct blkcg_gq *blkg, struct seq_file *s) { struct blkg_iostat_set *bis = &blkg->iostat; u64 rbytes, wbytes, rios, wios, dbytes, dios; bool has_stats = false; const char *dname; unsigned seq; int i; if (!blkg->online) return; dname = blkg_dev_name(blkg); if (!dname) return; seq_printf(s, "%s ", dname); do { seq = u64_stats_fetch_begin(&bis->sync); rbytes = bis->cur.bytes[BLKG_IOSTAT_READ]; wbytes = bis->cur.bytes[BLKG_IOSTAT_WRITE]; dbytes = bis->cur.bytes[BLKG_IOSTAT_DISCARD]; rios = bis->cur.ios[BLKG_IOSTAT_READ]; wios = bis->cur.ios[BLKG_IOSTAT_WRITE]; dios = bis->cur.ios[BLKG_IOSTAT_DISCARD]; } while (u64_stats_fetch_retry(&bis->sync, seq)); if (rbytes || wbytes || rios || wios) { has_stats = true; seq_printf(s, "rbytes=%llu wbytes=%llu rios=%llu wios=%llu dbytes=%llu dios=%llu", rbytes, wbytes, rios, wios, dbytes, dios); } if (blkcg_debug_stats && atomic_read(&blkg->use_delay)) { has_stats = true; seq_printf(s, " use_delay=%d delay_nsec=%llu", atomic_read(&blkg->use_delay), atomic64_read(&blkg->delay_nsec)); } for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; if (!blkg->pd[i] || !pol->pd_stat_fn) continue; if (pol->pd_stat_fn(blkg->pd[i], s)) has_stats = true; } if (has_stats) seq_printf(s, "\n"); } static int blkcg_print_stat(struct seq_file *sf, void *v) { struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); struct blkcg_gq *blkg; if (!seq_css(sf)->parent) blkcg_fill_root_iostats(); else cgroup_rstat_flush(blkcg->css.cgroup); rcu_read_lock(); hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { spin_lock_irq(&blkg->q->queue_lock); blkcg_print_one_stat(blkg, sf); spin_unlock_irq(&blkg->q->queue_lock); } rcu_read_unlock(); return 0; } static struct cftype blkcg_files[] = { { .name = "stat", .seq_show = blkcg_print_stat, }, { } /* terminate */ }; static struct cftype blkcg_legacy_files[] = { { .name = "reset_stats", .write_u64 = blkcg_reset_stats, }, { } /* terminate */ }; /* * blkcg destruction is a three-stage process. * * 1. Destruction starts. The blkcg_css_offline() callback is invoked * which offlines writeback. Here we tie the next stage of blkg destruction * to the completion of writeback associated with the blkcg. This lets us * avoid punting potentially large amounts of outstanding writeback to root * while maintaining any ongoing policies. 
The next stage is triggered when * the nr_cgwbs count goes to zero. * * 2. When the nr_cgwbs count goes to zero, blkcg_destroy_blkgs() is called * and handles the destruction of blkgs. Here the css reference held by * the blkg is put back eventually allowing blkcg_css_free() to be called. * This work may occur in cgwb_release_workfn() on the cgwb_release * workqueue. Any submitted ios that fail to get the blkg ref will be * punted to the root_blkg. * * 3. Once the blkcg ref count goes to zero, blkcg_css_free() is called. * This finally frees the blkcg. */ /** * blkcg_css_offline - cgroup css_offline callback * @css: css of interest * * This function is called when @css is about to go away. Here the cgwbs are * offlined first and only once writeback associated with the blkcg has * finished do we start step 2 (see above). */ static void blkcg_css_offline(struct cgroup_subsys_state *css) { struct blkcg *blkcg = css_to_blkcg(css); /* this prevents anyone from attaching or migrating to this blkcg */ wb_blkcg_offline(blkcg); /* put the base online pin allowing step 2 to be triggered */ blkcg_unpin_online(blkcg); } /** * blkcg_destroy_blkgs - responsible for shooting down blkgs * @blkcg: blkcg of interest * * blkgs should be removed while holding both q and blkcg locks. As blkcg lock * is nested inside q lock, this function performs reverse double lock dancing. * Destroying the blkgs releases the reference held on the blkcg's css allowing * blkcg_css_free to eventually be called. * * This is the blkcg counterpart of ioc_release_fn(). */ void blkcg_destroy_blkgs(struct blkcg *blkcg) { might_sleep(); spin_lock_irq(&blkcg->lock); while (!hlist_empty(&blkcg->blkg_list)) { struct blkcg_gq *blkg = hlist_entry(blkcg->blkg_list.first, struct blkcg_gq, blkcg_node); struct request_queue *q = blkg->q; if (need_resched() || !spin_trylock(&q->queue_lock)) { /* * Given that the system can accumulate a huge number * of blkgs in pathological cases, check to see if we * need to rescheduling to avoid softlockup. */ spin_unlock_irq(&blkcg->lock); cond_resched(); spin_lock_irq(&blkcg->lock); continue; } blkg_destroy(blkg); spin_unlock(&q->queue_lock); } spin_unlock_irq(&blkcg->lock); } static void blkcg_css_free(struct cgroup_subsys_state *css) { struct blkcg *blkcg = css_to_blkcg(css); int i; mutex_lock(&blkcg_pol_mutex); list_del(&blkcg->all_blkcgs_node); for (i = 0; i < BLKCG_MAX_POLS; i++) if (blkcg->cpd[i]) blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]); mutex_unlock(&blkcg_pol_mutex); kfree(blkcg); } static struct cgroup_subsys_state * blkcg_css_alloc(struct cgroup_subsys_state *parent_css) { struct blkcg *blkcg; struct cgroup_subsys_state *ret; int i; mutex_lock(&blkcg_pol_mutex); if (!parent_css) { blkcg = &blkcg_root; } else { blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL); if (!blkcg) { ret = ERR_PTR(-ENOMEM); goto unlock; } } for (i = 0; i < BLKCG_MAX_POLS ; i++) { struct blkcg_policy *pol = blkcg_policy[i]; struct blkcg_policy_data *cpd; /* * If the policy hasn't been attached yet, wait for it * to be attached before doing anything else. Otherwise, * check if the policy requires any specific per-cgroup * data: if it does, allocate and initialize it. 
*/ if (!pol || !pol->cpd_alloc_fn) continue; cpd = pol->cpd_alloc_fn(GFP_KERNEL); if (!cpd) { ret = ERR_PTR(-ENOMEM); goto free_pd_blkcg; } blkcg->cpd[i] = cpd; cpd->blkcg = blkcg; cpd->plid = i; if (pol->cpd_init_fn) pol->cpd_init_fn(cpd); } spin_lock_init(&blkcg->lock); refcount_set(&blkcg->online_pin, 1); INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_NOWAIT | __GFP_NOWARN); INIT_HLIST_HEAD(&blkcg->blkg_list); #ifdef CONFIG_CGROUP_WRITEBACK INIT_LIST_HEAD(&blkcg->cgwb_list); #endif list_add_tail(&blkcg->all_blkcgs_node, &all_blkcgs); mutex_unlock(&blkcg_pol_mutex); return &blkcg->css; free_pd_blkcg: for (i--; i >= 0; i--) if (blkcg->cpd[i]) blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]); if (blkcg != &blkcg_root) kfree(blkcg); unlock: mutex_unlock(&blkcg_pol_mutex); return ret; } static int blkcg_css_online(struct cgroup_subsys_state *css) { struct blkcg *blkcg = css_to_blkcg(css); struct blkcg *parent = blkcg_parent(blkcg); /* * blkcg_pin_online() is used to delay blkcg offline so that blkgs * don't go offline while cgwbs are still active on them. Pin the * parent so that offline always happens towards the root. */ if (parent) blkcg_pin_online(parent); return 0; } /** * blkcg_init_queue - initialize blkcg part of request queue * @q: request_queue to initialize * * Called from blk_alloc_queue(). Responsible for initializing blkcg * part of new request_queue @q. * * RETURNS: * 0 on success, -errno on failure. */ int blkcg_init_queue(struct request_queue *q) { struct blkcg_gq *new_blkg, *blkg; bool preloaded; int ret; new_blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL); if (!new_blkg) return -ENOMEM; preloaded = !radix_tree_preload(GFP_KERNEL); /* Make sure the root blkg exists. */ rcu_read_lock(); spin_lock_irq(&q->queue_lock); blkg = blkg_create(&blkcg_root, q, new_blkg); if (IS_ERR(blkg)) goto err_unlock; q->root_blkg = blkg; spin_unlock_irq(&q->queue_lock); rcu_read_unlock(); if (preloaded) radix_tree_preload_end(); ret = blk_ioprio_init(q); if (ret) goto err_destroy_all; ret = blk_throtl_init(q); if (ret) goto err_destroy_all; ret = blk_iolatency_init(q); if (ret) { blk_throtl_exit(q); goto err_destroy_all; } return 0; err_destroy_all: blkg_destroy_all(q); return ret; err_unlock: spin_unlock_irq(&q->queue_lock); rcu_read_unlock(); if (preloaded) radix_tree_preload_end(); return PTR_ERR(blkg); } /** * blkcg_exit_queue - exit and release blkcg part of request_queue * @q: request_queue being released * * Called from blk_exit_queue(). Responsible for exiting blkcg part. 
*/ void blkcg_exit_queue(struct request_queue *q) { blkg_destroy_all(q); blk_throtl_exit(q); } static void blkcg_bind(struct cgroup_subsys_state *root_css) { int i; mutex_lock(&blkcg_pol_mutex); for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; struct blkcg *blkcg; if (!pol || !pol->cpd_bind_fn) continue; list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) if (blkcg->cpd[pol->plid]) pol->cpd_bind_fn(blkcg->cpd[pol->plid]); } mutex_unlock(&blkcg_pol_mutex); } static void blkcg_exit(struct task_struct *tsk) { if (tsk->throttle_queue) blk_put_queue(tsk->throttle_queue); tsk->throttle_queue = NULL; } struct cgroup_subsys io_cgrp_subsys = { .css_alloc = blkcg_css_alloc, .css_online = blkcg_css_online, .css_offline = blkcg_css_offline, .css_free = blkcg_css_free, .css_rstat_flush = blkcg_rstat_flush, .bind = blkcg_bind, .dfl_cftypes = blkcg_files, .legacy_cftypes = blkcg_legacy_files, .legacy_name = "blkio", .exit = blkcg_exit, #ifdef CONFIG_MEMCG /* * This ensures that, if available, memcg is automatically enabled * together on the default hierarchy so that the owner cgroup can * be retrieved from writeback pages. */ .depends_on = 1 << memory_cgrp_id, #endif }; EXPORT_SYMBOL_GPL(io_cgrp_subsys); /** * blkcg_activate_policy - activate a blkcg policy on a request_queue * @q: request_queue of interest * @pol: blkcg policy to activate * * Activate @pol on @q. Requires %GFP_KERNEL context. @q goes through * bypass mode to populate its blkgs with policy_data for @pol. * * Activation happens with @q bypassed, so nobody would be accessing blkgs * from IO path. Update of each blkg is protected by both queue and blkcg * locks so that holding either lock and testing blkcg_policy_enabled() is * always enough for dereferencing policy data. * * The caller is responsible for synchronizing [de]activations and policy * [un]registerations. Returns 0 on success, -errno on failure. */ int blkcg_activate_policy(struct request_queue *q, const struct blkcg_policy *pol) { struct blkg_policy_data *pd_prealloc = NULL; struct blkcg_gq *blkg, *pinned_blkg = NULL; int ret; if (blkcg_policy_enabled(q, pol)) return 0; if (queue_is_mq(q)) blk_mq_freeze_queue(q); retry: spin_lock_irq(&q->queue_lock); /* blkg_list is pushed at the head, reverse walk to allocate parents first */ list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) { struct blkg_policy_data *pd; if (blkg->pd[pol->plid]) continue; /* If prealloc matches, use it; otherwise try GFP_NOWAIT */ if (blkg == pinned_blkg) { pd = pd_prealloc; pd_prealloc = NULL; } else { pd = pol->pd_alloc_fn(GFP_NOWAIT | __GFP_NOWARN, q, blkg->blkcg); } if (!pd) { /* * GFP_NOWAIT failed. Free the existing one and * prealloc for @blkg w/ GFP_KERNEL. 
*/ if (pinned_blkg) blkg_put(pinned_blkg); blkg_get(blkg); pinned_blkg = blkg; spin_unlock_irq(&q->queue_lock); if (pd_prealloc) pol->pd_free_fn(pd_prealloc); pd_prealloc = pol->pd_alloc_fn(GFP_KERNEL, q, blkg->blkcg); if (pd_prealloc) goto retry; else goto enomem; } blkg->pd[pol->plid] = pd; pd->blkg = blkg; pd->plid = pol->plid; } /* all allocated, init in the same order */ if (pol->pd_init_fn) list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) pol->pd_init_fn(blkg->pd[pol->plid]); if (pol->pd_online_fn) list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) pol->pd_online_fn(blkg->pd[pol->plid]); __set_bit(pol->plid, q->blkcg_pols); ret = 0; spin_unlock_irq(&q->queue_lock); out: if (queue_is_mq(q)) blk_mq_unfreeze_queue(q); if (pinned_blkg) blkg_put(pinned_blkg); if (pd_prealloc) pol->pd_free_fn(pd_prealloc); return ret; enomem: /* alloc failed, nothing's initialized yet, free everything */ spin_lock_irq(&q->queue_lock); list_for_each_entry(blkg, &q->blkg_list, q_node) { struct blkcg *blkcg = blkg->blkcg; spin_lock(&blkcg->lock); if (blkg->pd[pol->plid]) { pol->pd_free_fn(blkg->pd[pol->plid]); blkg->pd[pol->plid] = NULL; } spin_unlock(&blkcg->lock); } spin_unlock_irq(&q->queue_lock); ret = -ENOMEM; goto out; } EXPORT_SYMBOL_GPL(blkcg_activate_policy); /** * blkcg_deactivate_policy - deactivate a blkcg policy on a request_queue * @q: request_queue of interest * @pol: blkcg policy to deactivate * * Deactivate @pol on @q. Follows the same synchronization rules as * blkcg_activate_policy(). */ void blkcg_deactivate_policy(struct request_queue *q, const struct blkcg_policy *pol) { struct blkcg_gq *blkg; if (!blkcg_policy_enabled(q, pol)) return; if (queue_is_mq(q)) blk_mq_freeze_queue(q); spin_lock_irq(&q->queue_lock); __clear_bit(pol->plid, q->blkcg_pols); list_for_each_entry(blkg, &q->blkg_list, q_node) { struct blkcg *blkcg = blkg->blkcg; spin_lock(&blkcg->lock); if (blkg->pd[pol->plid]) { if (pol->pd_offline_fn) pol->pd_offline_fn(blkg->pd[pol->plid]); pol->pd_free_fn(blkg->pd[pol->plid]); blkg->pd[pol->plid] = NULL; } spin_unlock(&blkcg->lock); } spin_unlock_irq(&q->queue_lock); if (queue_is_mq(q)) blk_mq_unfreeze_queue(q); } EXPORT_SYMBOL_GPL(blkcg_deactivate_policy); /** * blkcg_policy_register - register a blkcg policy * @pol: blkcg policy to register * * Register @pol with blkcg core. Might sleep and @pol may be modified on * successful registration. Returns 0 on success and -errno on failure. 
*/ int blkcg_policy_register(struct blkcg_policy *pol) { struct blkcg *blkcg; int i, ret; mutex_lock(&blkcg_pol_register_mutex); mutex_lock(&blkcg_pol_mutex); /* find an empty slot */ ret = -ENOSPC; for (i = 0; i < BLKCG_MAX_POLS; i++) if (!blkcg_policy[i]) break; if (i >= BLKCG_MAX_POLS) { pr_warn("blkcg_policy_register: BLKCG_MAX_POLS too small\n"); goto err_unlock; } /* Make sure cpd/pd_alloc_fn and cpd/pd_free_fn in pairs */ if ((!pol->cpd_alloc_fn ^ !pol->cpd_free_fn) || (!pol->pd_alloc_fn ^ !pol->pd_free_fn)) goto err_unlock; /* register @pol */ pol->plid = i; blkcg_policy[pol->plid] = pol; /* allocate and install cpd's */ if (pol->cpd_alloc_fn) { list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) { struct blkcg_policy_data *cpd; cpd = pol->cpd_alloc_fn(GFP_KERNEL); if (!cpd) goto err_free_cpds; blkcg->cpd[pol->plid] = cpd; cpd->blkcg = blkcg; cpd->plid = pol->plid; if (pol->cpd_init_fn) pol->cpd_init_fn(cpd); } } mutex_unlock(&blkcg_pol_mutex); /* everything is in place, add intf files for the new policy */ if (pol->dfl_cftypes) WARN_ON(cgroup_add_dfl_cftypes(&io_cgrp_subsys, pol->dfl_cftypes)); if (pol->legacy_cftypes) WARN_ON(cgroup_add_legacy_cftypes(&io_cgrp_subsys, pol->legacy_cftypes)); mutex_unlock(&blkcg_pol_register_mutex); return 0; err_free_cpds: if (pol->cpd_free_fn) { list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) { if (blkcg->cpd[pol->plid]) { pol->cpd_free_fn(blkcg->cpd[pol->plid]); blkcg->cpd[pol->plid] = NULL; } } } blkcg_policy[pol->plid] = NULL; err_unlock: mutex_unlock(&blkcg_pol_mutex); mutex_unlock(&blkcg_pol_register_mutex); return ret; } EXPORT_SYMBOL_GPL(blkcg_policy_register); /** * blkcg_policy_unregister - unregister a blkcg policy * @pol: blkcg policy to unregister * * Undo blkcg_policy_register(@pol). Might sleep. */ void blkcg_policy_unregister(struct blkcg_policy *pol) { struct blkcg *blkcg; mutex_lock(&blkcg_pol_register_mutex); if (WARN_ON(blkcg_policy[pol->plid] != pol)) goto out_unlock; /* kill the intf files first */ if (pol->dfl_cftypes) cgroup_rm_cftypes(pol->dfl_cftypes); if (pol->legacy_cftypes) cgroup_rm_cftypes(pol->legacy_cftypes); /* remove cpds and unregister */ mutex_lock(&blkcg_pol_mutex); if (pol->cpd_free_fn) { list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) { if (blkcg->cpd[pol->plid]) { pol->cpd_free_fn(blkcg->cpd[pol->plid]); blkcg->cpd[pol->plid] = NULL; } } } blkcg_policy[pol->plid] = NULL; mutex_unlock(&blkcg_pol_mutex); out_unlock: mutex_unlock(&blkcg_pol_register_mutex); } EXPORT_SYMBOL_GPL(blkcg_policy_unregister); bool __blkcg_punt_bio_submit(struct bio *bio) { struct blkcg_gq *blkg = bio->bi_blkg; /* consume the flag first */ bio->bi_opf &= ~REQ_CGROUP_PUNT; /* never bounce for the root cgroup */ if (!blkg->parent) return false; spin_lock_bh(&blkg->async_bio_lock); bio_list_add(&blkg->async_bios, bio); spin_unlock_bh(&blkg->async_bio_lock); queue_work(blkcg_punt_bio_wq, &blkg->async_bio_work); return true; } /* * Scale the accumulated delay based on how long it has been since we updated * the delay. We only call this when we are adding delay, in case it's been a * while since we added delay, and when we are checking to see if we need to * delay a task, to account for any delays that may have occurred. */ static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now) { u64 old = atomic64_read(&blkg->delay_start); /* negative use_delay means no scaling, see blkcg_set_delay() */ if (atomic_read(&blkg->use_delay) < 0) return; /* * We only want to scale down every second. 
The idea here is that we * want to delay people for min(delay_nsec, NSEC_PER_SEC) in a certain * time window. We only want to throttle tasks for recent delay that * has occurred, in 1 second time windows since that's the maximum * things can be throttled. We save the current delay window in * blkg->last_delay so we know what amount is still left to be charged * to the blkg from this point onward. blkg->last_use keeps track of * the use_delay counter. The idea is if we're unthrottling the blkg we * are ok with whatever is happening now, and we can take away more of * the accumulated delay as we've already throttled enough that * everybody is happy with their IO latencies. */ if (time_before64(old + NSEC_PER_SEC, now) && atomic64_cmpxchg(&blkg->delay_start, old, now) == old) { u64 cur = atomic64_read(&blkg->delay_nsec); u64 sub = min_t(u64, blkg->last_delay, now - old); int cur_use = atomic_read(&blkg->use_delay); /* * We've been unthrottled, subtract a larger chunk of our * accumulated delay. */ if (cur_use < blkg->last_use) sub = max_t(u64, sub, blkg->last_delay >> 1); /* * This shouldn't happen, but handle it anyway. Our delay_nsec * should only ever be growing except here where we subtract out * min(last_delay, 1 second), but lord knows bugs happen and I'd * rather not end up with negative numbers. */ if (unlikely(cur < sub)) { atomic64_set(&blkg->delay_nsec, 0); blkg->last_delay = 0; } else { atomic64_sub(sub, &blkg->delay_nsec); blkg->last_delay = cur - sub; } blkg->last_use = cur_use; } } /* * This is called when we want to actually walk up the hierarchy and check to * see if we need to throttle, and then actually throttle if there is some * accumulated delay. This should only be called upon return to user space so * we're not holding some lock that would induce a priority inversion. */ static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay) { unsigned long pflags; bool clamp; u64 now = ktime_to_ns(ktime_get()); u64 exp; u64 delay_nsec = 0; int tok; while (blkg->parent) { int use_delay = atomic_read(&blkg->use_delay); if (use_delay) { u64 this_delay; blkcg_scale_delay(blkg, now); this_delay = atomic64_read(&blkg->delay_nsec); if (this_delay > delay_nsec) { delay_nsec = this_delay; clamp = use_delay > 0; } } blkg = blkg->parent; } if (!delay_nsec) return; /* * Let's not sleep for all eternity if we've amassed a huge delay. * Swapping or metadata IO can accumulate 10's of seconds worth of * delay, and we want userspace to be able to do _something_ so cap the * delays at 0.25s. If there's 10's of seconds worth of delay then the * tasks will be delayed for 0.25 second for every syscall. If * blkcg_set_delay() was used as indicated by negative use_delay, the * caller is responsible for regulating the range. */ if (clamp) delay_nsec = min_t(u64, delay_nsec, 250 * NSEC_PER_MSEC); if (use_memdelay) psi_memstall_enter(&pflags); exp = ktime_add_ns(now, delay_nsec); tok = io_schedule_prepare(); do { __set_current_state(TASK_KILLABLE); if (!schedule_hrtimeout(&exp, HRTIMER_MODE_ABS)) break; } while (!fatal_signal_pending(current)); io_schedule_finish(tok); if (use_memdelay) psi_memstall_leave(&pflags); } /** * blkcg_maybe_throttle_current - throttle the current task if it has been marked * * This is only called if we've been marked with set_notify_resume(). Obviously * we can be set_notify_resume() for reasons other than blkcg throttling, so we * check to see if current->throttle_queue is set and if not this doesn't do * anything. 
This should only ever be called by the resume code, it's not meant * to be called by people willy-nilly as it will actually do the work to * throttle the task if it is setup for throttling. */ void blkcg_maybe_throttle_current(void) { struct request_queue *q = current->throttle_queue; struct cgroup_subsys_state *css; struct blkcg *blkcg; struct blkcg_gq *blkg; bool use_memdelay = current->use_memdelay; if (!q) return; current->throttle_queue = NULL; current->use_memdelay = false; rcu_read_lock(); css = kthread_blkcg(); if (css) blkcg = css_to_blkcg(css); else blkcg = css_to_blkcg(task_css(current, io_cgrp_id)); if (!blkcg) goto out; blkg = blkg_lookup(blkcg, q); if (!blkg) goto out; if (!blkg_tryget(blkg)) goto out; rcu_read_unlock(); blkcg_maybe_throttle_blkg(blkg, use_memdelay); blkg_put(blkg); blk_put_queue(q); return; out: rcu_read_unlock(); blk_put_queue(q); } /** * blkcg_schedule_throttle - this task needs to check for throttling * @q: the request queue IO was submitted on * @use_memdelay: do we charge this to memory delay for PSI * * This is called by the IO controller when we know there's delay accumulated * for the blkg for this task. We do not pass the blkg because there are places * we call this that may not have that information, the swapping code for * instance will only have a request_queue at that point. This set's the * notify_resume for the task to check and see if it requires throttling before * returning to user space. * * We will only schedule once per syscall. You can call this over and over * again and it will only do the check once upon return to user space, and only * throttle once. If the task needs to be throttled again it'll need to be * re-set at the next time we see the task. */ void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay) { if (unlikely(current->flags & PF_KTHREAD)) return; if (current->throttle_queue != q) { if (!blk_get_queue(q)) return; if (current->throttle_queue) blk_put_queue(current->throttle_queue); current->throttle_queue = q; } if (use_memdelay) current->use_memdelay = use_memdelay; set_notify_resume(current); } /** * blkcg_add_delay - add delay to this blkg * @blkg: blkg of interest * @now: the current time in nanoseconds * @delta: how many nanoseconds of delay to add * * Charge @delta to the blkg's current delay accumulation. This is used to * throttle tasks if an IO controller thinks we need more throttling. */ void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta) { if (WARN_ON_ONCE(atomic_read(&blkg->use_delay) < 0)) return; blkcg_scale_delay(blkg, now); atomic64_add(delta, &blkg->delay_nsec); } /** * blkg_tryget_closest - try and get a blkg ref on the closet blkg * @bio: target bio * @css: target css * * As the failure mode here is to walk up the blkg tree, this ensure that the * blkg->parent pointers are always valid. This returns the blkg that it ended * up taking a reference on or %NULL if no reference was taken. */ static inline struct blkcg_gq *blkg_tryget_closest(struct bio *bio, struct cgroup_subsys_state *css) { struct blkcg_gq *blkg, *ret_blkg = NULL; rcu_read_lock(); blkg = blkg_lookup_create(css_to_blkcg(css), bio->bi_bdev->bd_disk->queue); while (blkg) { if (blkg_tryget(blkg)) { ret_blkg = blkg; break; } blkg = blkg->parent; } rcu_read_unlock(); return ret_blkg; } /** * bio_associate_blkg_from_css - associate a bio with a specified css * @bio: target bio * @css: target css * * Associate @bio with the blkg found by combining the css's blkg and the * request_queue of the @bio. 
An association failure is handled by walking up * the blkg tree. Therefore, the blkg associated can be anything between @blkg * and q->root_blkg. This situation only happens when a cgroup is dying and * then the remaining bios will spill to the closest alive blkg. * * A reference will be taken on the blkg and will be released when @bio is * freed. */ void bio_associate_blkg_from_css(struct bio *bio, struct cgroup_subsys_state *css) { if (bio->bi_blkg) blkg_put(bio->bi_blkg); if (css && css->parent) { bio->bi_blkg = blkg_tryget_closest(bio, css); } else { blkg_get(bio->bi_bdev->bd_disk->queue->root_blkg); bio->bi_blkg = bio->bi_bdev->bd_disk->queue->root_blkg; } } EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css); /** * bio_associate_blkg - associate a bio with a blkg * @bio: target bio * * Associate @bio with the blkg found from the bio's css and request_queue. * If one is not found, bio_lookup_blkg() creates the blkg. If a blkg is * already associated, the css is reused and association redone as the * request_queue may have changed. */ void bio_associate_blkg(struct bio *bio) { struct cgroup_subsys_state *css; rcu_read_lock(); if (bio->bi_blkg) css = &bio_blkcg(bio)->css; else css = blkcg_css(); bio_associate_blkg_from_css(bio, css); rcu_read_unlock(); } EXPORT_SYMBOL_GPL(bio_associate_blkg); /** * bio_clone_blkg_association - clone blkg association from src to dst bio * @dst: destination bio * @src: source bio */ void bio_clone_blkg_association(struct bio *dst, struct bio *src) { if (src->bi_blkg) bio_associate_blkg_from_css(dst, &bio_blkcg(src)->css); } EXPORT_SYMBOL_GPL(bio_clone_blkg_association); static int blk_cgroup_io_type(struct bio *bio) { if (op_is_discard(bio->bi_opf)) return BLKG_IOSTAT_DISCARD; if (op_is_write(bio->bi_opf)) return BLKG_IOSTAT_WRITE; return BLKG_IOSTAT_READ; } void blk_cgroup_bio_start(struct bio *bio) { int rwd = blk_cgroup_io_type(bio), cpu; struct blkg_iostat_set *bis; unsigned long flags; cpu = get_cpu(); bis = per_cpu_ptr(bio->bi_blkg->iostat_cpu, cpu); flags = u64_stats_update_begin_irqsave(&bis->sync); /* * If the bio is flagged with BIO_CGROUP_ACCT it means this is a split * bio and we would have already accounted for the size of the bio. */ if (!bio_flagged(bio, BIO_CGROUP_ACCT)) { bio_set_flag(bio, BIO_CGROUP_ACCT); bis->cur.bytes[rwd] += bio->bi_iter.bi_size; } bis->cur.ios[rwd]++; u64_stats_update_end_irqrestore(&bis->sync, flags); if (cgroup_subsys_on_dfl(io_cgrp_subsys)) cgroup_rstat_updated(bio->bi_blkg->blkcg->css.cgroup, cpu); put_cpu(); } static int __init blkcg_init(void) { blkcg_punt_bio_wq = alloc_workqueue("blkcg_punt_bio", WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND | WQ_SYSFS, 0); if (!blkcg_punt_bio_wq) return -ENOMEM; return 0; } subsys_initcall(blkcg_init); module_param(blkcg_debug_stats, bool, 0644); MODULE_PARM_DESC(blkcg_debug_stats, "True if you want debug stats, false if not");
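/*
 * A minimal usage sketch, not part of blk-cgroup.c above: how a blkcg
 * policy's cftype write handler is expected to pair blkg_conf_prep() with
 * blkg_conf_finish() for "MAJ:MIN <value>" style configuration writes.
 * "example_policy", "struct example_pd", "limit" and the helpers below are
 * hypothetical, illustration-only names; only the blkg_conf_*() calls and
 * the blkg_to_pd() embedding pattern come from the blkcg code itself.
 */
static struct blkcg_policy example_policy;

struct example_pd {
	struct blkg_policy_data pd;	/* embedded blkg_policy_data */
	u64 limit;			/* hypothetical per-blkg knob */
};

static struct example_pd *blkg_to_example_pd(struct blkcg_gq *blkg)
{
	return container_of(blkg_to_pd(blkg, &example_policy),
			    struct example_pd, pd);
}

static ssize_t example_limit_write(struct kernfs_open_file *of, char *buf,
				   size_t nbytes, loff_t off)
{
	struct blkcg *blkcg = css_to_blkcg(of_css(of));
	struct blkg_conf_ctx ctx;
	u64 limit;
	int ret;

	/* Parses the MAJ:MIN prefix; returns with RCU read lock and queue lock held. */
	ret = blkg_conf_prep(blkcg, &example_policy, buf, &ctx);
	if (ret)
		return ret;

	/* ctx.body points just past the MAJ:MIN prefix. */
	ret = -EINVAL;
	if (sscanf(ctx.body, "%llu", &limit) == 1) {
		blkg_to_example_pd(ctx.blkg)->limit = limit;
		ret = 0;
	}

	/* Drops the locks taken by blkg_conf_prep() and puts the bdev. */
	blkg_conf_finish(&ctx);
	return ret ?: nbytes;
}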
/* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Berkeley style UIO structures - Alan Cox 1994. */ #ifndef __LINUX_UIO_H #define __LINUX_UIO_H #include <linux/kernel.h> #include <linux/thread_info.h> #include <uapi/linux/uio.h> struct page; struct pipe_inode_info; struct kvec { void *iov_base; /* and that should *never* hold a userland pointer */ size_t iov_len; }; enum iter_type { /* iter types */ ITER_IOVEC, ITER_KVEC, ITER_BVEC, ITER_PIPE, ITER_XARRAY, ITER_DISCARD, }; struct iov_iter_state { size_t iov_offset; size_t count; unsigned long nr_segs; }; struct iov_iter { u8 iter_type; bool nofault; bool data_source; size_t iov_offset; size_t count; union { const struct iovec *iov; const struct kvec *kvec; const struct bio_vec *bvec; struct xarray *xarray; struct pipe_inode_info *pipe; }; union { unsigned long nr_segs; struct { unsigned int head; unsigned int start_head; }; loff_t xarray_start; }; }; static inline enum iter_type iov_iter_type(const struct iov_iter *i) { return i->iter_type; } static inline void iov_iter_save_state(struct iov_iter *iter, struct iov_iter_state *state) { state->iov_offset = iter->iov_offset; state->count = iter->count; state->nr_segs = iter->nr_segs; } static inline bool iter_is_iovec(const struct iov_iter *i) { return iov_iter_type(i) == ITER_IOVEC; } static inline bool iov_iter_is_kvec(const struct iov_iter *i) { return iov_iter_type(i) == ITER_KVEC; } static inline bool iov_iter_is_bvec(const struct iov_iter *i) { return iov_iter_type(i) == ITER_BVEC; } static inline bool iov_iter_is_pipe(const struct iov_iter *i) { return iov_iter_type(i) == ITER_PIPE; } static inline bool iov_iter_is_discard(const struct iov_iter *i) { return iov_iter_type(i) == ITER_DISCARD; } static inline bool iov_iter_is_xarray(const struct iov_iter *i) { return iov_iter_type(i) == ITER_XARRAY; } static inline unsigned char iov_iter_rw(const struct iov_iter *i) { return i->data_source ? WRITE : READ; } /* * Total number of bytes covered by an iovec. * * NOTE that it is not safe to use this function until all the iovec's * segment lengths have been validated. Because the individual lengths can * overflow a size_t when added together. 
*/ static inline size_t iov_length(const struct iovec *iov, unsigned long nr_segs) { unsigned long seg; size_t ret = 0; for (seg = 0; seg < nr_segs; seg++) ret += iov[seg].iov_len; return ret; } static inline struct iovec iov_iter_iovec(const struct iov_iter *iter) { return (struct iovec) { .iov_base = iter->iov->iov_base + iter->iov_offset, .iov_len = min(iter->count, iter->iov->iov_len - iter->iov_offset), }; } size_t copy_page_from_iter_atomic(struct page *page, unsigned offset, size_t bytes, struct iov_iter *i); void iov_iter_advance(struct iov_iter *i, size_t bytes); void iov_iter_revert(struct iov_iter *i, size_t bytes); size_t fault_in_iov_iter_readable(const struct iov_iter *i, size_t bytes); size_t fault_in_iov_iter_writeable(const struct iov_iter *i, size_t bytes); size_t iov_iter_single_seg_count(const struct iov_iter *i); size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes, struct iov_iter *i); size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes, struct iov_iter *i); size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i); size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i); size_t _copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i); static __always_inline __must_check size_t copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i) { if (unlikely(!check_copy_size(addr, bytes, true))) return 0; else return _copy_to_iter(addr, bytes, i); } static __always_inline __must_check size_t copy_from_iter(void *addr, size_t bytes, struct iov_iter *i) { if (unlikely(!check_copy_size(addr, bytes, false))) return 0; else return _copy_from_iter(addr, bytes, i); } static __always_inline __must_check bool copy_from_iter_full(void *addr, size_t bytes, struct iov_iter *i) { size_t copied = copy_from_iter(addr, bytes, i); if (likely(copied == bytes)) return true; iov_iter_revert(i, copied); return false; } static __always_inline __must_check size_t copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i) { if (unlikely(!check_copy_size(addr, bytes, false))) return 0; else return _copy_from_iter_nocache(addr, bytes, i); } static __always_inline __must_check bool copy_from_iter_full_nocache(void *addr, size_t bytes, struct iov_iter *i) { size_t copied = copy_from_iter_nocache(addr, bytes, i); if (likely(copied == bytes)) return true; iov_iter_revert(i, copied); return false; } #ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE /* * Note, users like pmem that depend on the stricter semantics of * copy_from_iter_flushcache() than copy_from_iter_nocache() must check for * IS_ENABLED(CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE) before assuming that the * destination is flushed from the cache on return. 
*/ size_t _copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i); #else #define _copy_from_iter_flushcache _copy_from_iter_nocache #endif #ifdef CONFIG_ARCH_HAS_COPY_MC size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i); #else #define _copy_mc_to_iter _copy_to_iter #endif static __always_inline __must_check size_t copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i) { if (unlikely(!check_copy_size(addr, bytes, false))) return 0; else return _copy_from_iter_flushcache(addr, bytes, i); } static __always_inline __must_check size_t copy_mc_to_iter(void *addr, size_t bytes, struct iov_iter *i) { if (unlikely(!check_copy_size(addr, bytes, true))) return 0; else return _copy_mc_to_iter(addr, bytes, i); } size_t iov_iter_zero(size_t bytes, struct iov_iter *); unsigned long iov_iter_alignment(const struct iov_iter *i); unsigned long iov_iter_gap_alignment(const struct iov_iter *i); void iov_iter_init(struct iov_iter *i, unsigned int direction, const struct iovec *iov, unsigned long nr_segs, size_t count); void iov_iter_kvec(struct iov_iter *i, unsigned int direction, const struct kvec *kvec, unsigned long nr_segs, size_t count); void iov_iter_bvec(struct iov_iter *i, unsigned int direction, const struct bio_vec *bvec, unsigned long nr_segs, size_t count); void iov_iter_pipe(struct iov_iter *i, unsigned int direction, struct pipe_inode_info *pipe, size_t count); void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count); void iov_iter_xarray(struct iov_iter *i, unsigned int direction, struct xarray *xarray, loff_t start, size_t count); ssize_t iov_iter_get_pages(struct iov_iter *i, struct page **pages, size_t maxsize, unsigned maxpages, size_t *start); ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, struct page ***pages, size_t maxsize, size_t *start); int iov_iter_npages(const struct iov_iter *i, int maxpages); void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state); const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags); static inline size_t iov_iter_count(const struct iov_iter *i) { return i->count; } /* * Cap the iov_iter by given limit; note that the second argument is * *not* the new size - it's upper limit for such. Passing it a value * greater than the amount of data in iov_iter is fine - it'll just do * nothing in that case. */ static inline void iov_iter_truncate(struct iov_iter *i, u64 count) { /* * count doesn't have to fit in size_t - comparison extends both * operands to u64 here and any value that would be truncated by * conversion in assignement is by definition greater than all * values of size_t, including old i->count. */ if (i->count > count) i->count = count; } /* * reexpand a previously truncated iterator; count must be no more than how much * we had shrunk it. 
*/ static inline void iov_iter_reexpand(struct iov_iter *i, size_t count) { i->count = count; } struct csum_state { __wsum csum; size_t off; }; size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *csstate, struct iov_iter *i); size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum, struct iov_iter *i); static __always_inline __must_check bool csum_and_copy_from_iter_full(void *addr, size_t bytes, __wsum *csum, struct iov_iter *i) { size_t copied = csum_and_copy_from_iter(addr, bytes, csum, i); if (likely(copied == bytes)) return true; iov_iter_revert(i, copied); return false; } size_t hash_and_copy_to_iter(const void *addr, size_t bytes, void *hashp, struct iov_iter *i); struct iovec *iovec_from_user(const struct iovec __user *uvector, unsigned long nr_segs, unsigned long fast_segs, struct iovec *fast_iov, bool compat); ssize_t import_iovec(int type, const struct iovec __user *uvec, unsigned nr_segs, unsigned fast_segs, struct iovec **iovp, struct iov_iter *i); ssize_t __import_iovec(int type, const struct iovec __user *uvec, unsigned nr_segs, unsigned fast_segs, struct iovec **iovp, struct iov_iter *i, bool compat); int import_single_range(int type, void __user *buf, size_t len, struct iovec *iov, struct iov_iter *i); #endif
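/*
 * A minimal usage sketch, not part of uio.h above: build an ITER_KVEC
 * iterator over a kernel buffer and copy data into it with the helpers
 * declared in this header.  example_fill_buffer() and its parameters are
 * hypothetical; only the kvec/iov_iter API is taken from the header.
 */
static int example_fill_buffer(void *dst, size_t len, const void *src)
{
	struct kvec kv = { .iov_base = dst, .iov_len = len };
	struct iov_iter iter;

	/*
	 * READ means the iterator describes destination buffers, i.e. data
	 * flows into the kvec, as it would for a read(2)-style operation.
	 */
	iov_iter_kvec(&iter, READ, &kv, 1, len);

	/* copy_to_iter() returns the number of bytes actually copied. */
	if (copy_to_iter(src, len, &iter) != len)
		return -EFAULT;	/* should not happen for a kvec */

	/* The whole buffer was consumed, so the remaining count is zero. */
	WARN_ON(iov_iter_count(&iter) != 0);
	return 0;
}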
// SPDX-License-Identifier: GPL-2.0 /* * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar * Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner * * High-resolution kernel timers * * In contrast to the low-resolution timeout API, aka timer wheel, * hrtimers provide finer resolution and accuracy depending on system * configuration and capabilities. * * Started by: Thomas Gleixner and Ingo Molnar * * Credits: * Based on the original timer wheel code * * Help, testing, suggestions, bugfixes, improvements were * provided by: * * George Anzinger, Andrew Morton, Steven Rostedt, Roman Zippel * et. al. */ #include <linux/cpu.h> #include <linux/export.h> #include <linux/percpu.h> #include <linux/hrtimer.h> #include <linux/notifier.h> #include <linux/syscalls.h> #include <linux/interrupt.h> #include <linux/tick.h> #include <linux/err.h> #include <linux/debugobjects.h> #include <linux/sched/signal.h> #include <linux/sched/sysctl.h> #include <linux/sched/rt.h> #include <linux/sched/deadline.h> #include <linux/sched/nohz.h> #include <linux/sched/debug.h> #include <linux/timer.h> #include <linux/freezer.h> #include <linux/compat.h> #include <linux/uaccess.h> #include <trace/events/timer.h> #include "tick-internal.h" /* * Masks for selecting the soft and hard context timers from * cpu_base->active */ #define MASK_SHIFT (HRTIMER_BASE_MONOTONIC_SOFT) #define HRTIMER_ACTIVE_HARD ((1U << MASK_SHIFT) - 1) #define HRTIMER_ACTIVE_SOFT (HRTIMER_ACTIVE_HARD << MASK_SHIFT) #define HRTIMER_ACTIVE_ALL (HRTIMER_ACTIVE_SOFT | HRTIMER_ACTIVE_HARD) /* * The timer bases: * * There are more clockids than hrtimer bases. Thus, we index * into the timer bases by the hrtimer_base_type enum. When trying * to reach a base using a clockid, hrtimer_clockid_to_base() * is used to convert from clockid to the proper hrtimer_base_type. */ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = { .lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock), .clock_base = { { .index = HRTIMER_BASE_MONOTONIC, .clockid = CLOCK_MONOTONIC, .get_time = &ktime_get, }, { .index = HRTIMER_BASE_REALTIME, .clockid = CLOCK_REALTIME, .get_time = &ktime_get_real, }, { .index = HRTIMER_BASE_BOOTTIME, .clockid = CLOCK_BOOTTIME, .get_time = &ktime_get_boottime, }, { .index = HRTIMER_BASE_TAI, .clockid = CLOCK_TAI, .get_time = &ktime_get_clocktai, }, { .index = HRTIMER_BASE_MONOTONIC_SOFT, .clockid = CLOCK_MONOTONIC, .get_time = &ktime_get, }, { .index = HRTIMER_BASE_REALTIME_SOFT, .clockid = CLOCK_REALTIME, .get_time = &ktime_get_real, }, { .index = HRTIMER_BASE_BOOTTIME_SOFT, .clockid = CLOCK_BOOTTIME, .get_time = &ktime_get_boottime, }, { .index = HRTIMER_BASE_TAI_SOFT, .clockid = CLOCK_TAI, .get_time = &ktime_get_clocktai, }, } }; static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { /* Make sure we catch unsupported clockids */ [0 ... 
MAX_CLOCKS - 1] = HRTIMER_MAX_CLOCK_BASES, [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME, [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC, [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME, [CLOCK_TAI] = HRTIMER_BASE_TAI, }; /* * Functions and macros which are different for UP/SMP systems are kept in a * single place */ #ifdef CONFIG_SMP /* * We require the migration_base for lock_hrtimer_base()/switch_hrtimer_base() * such that hrtimer_callback_running() can unconditionally dereference * timer->base->cpu_base */ static struct hrtimer_cpu_base migration_cpu_base = { .clock_base = { { .cpu_base = &migration_cpu_base, .seq = SEQCNT_RAW_SPINLOCK_ZERO(migration_cpu_base.seq, &migration_cpu_base.lock), }, }, }; #define migration_base migration_cpu_base.clock_base[0] static inline bool is_migration_base(struct hrtimer_clock_base *base) { return base == &migration_base; } /* * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock * means that all timers which are tied to this base via timer->base are * locked, and the base itself is locked too. * * So __run_timers/migrate_timers can safely modify all timers which could * be found on the lists/queues. * * When the timer's base is locked, and the timer removed from list, it is * possible to set timer->base = &migration_base and drop the lock: the timer * remains locked. */ static struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) { struct hrtimer_clock_base *base; for (;;) { base = READ_ONCE(timer->base); if (likely(base != &migration_base)) { raw_spin_lock_irqsave(&base->cpu_base->lock, *flags); if (likely(base == timer->base)) return base; /* The timer has migrated to another CPU: */ raw_spin_unlock_irqrestore(&base->cpu_base->lock, *flags); } cpu_relax(); } } /* * We do not migrate the timer when it is expiring before the next * event on the target cpu. When high resolution is enabled, we cannot * reprogram the target cpu hardware and we would cause it to fire * late. To keep it simple, we handle the high resolution enabled and * disabled case similar. * * Called with cpu_base->lock of target cpu held. */ static int hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base) { ktime_t expires; expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset); return expires < new_base->cpu_base->expires_next; } static inline struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, int pinned) { #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) if (static_branch_likely(&timers_migration_enabled) && !pinned) return &per_cpu(hrtimer_bases, get_nohz_timer_target()); #endif return base; } /* * We switch the timer base to a power-optimized selected CPU target, * if: * - NO_HZ_COMMON is enabled * - timer migration is enabled * - the timer callback is not running * - the timer is not the first expiring timer on the new target * * If one of the above requirements is not fulfilled we move the timer * to the current CPU or leave it on the previously assigned CPU if * the timer callback is currently running. 
*/ static inline struct hrtimer_clock_base * switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base, int pinned) { struct hrtimer_cpu_base *new_cpu_base, *this_cpu_base; struct hrtimer_clock_base *new_base; int basenum = base->index; this_cpu_base = this_cpu_ptr(&hrtimer_bases); new_cpu_base = get_target_base(this_cpu_base, pinned); again: new_base = &new_cpu_base->clock_base[basenum]; if (base != new_base) { /* * We are trying to move timer to new_base. * However we can't change timer's base while it is running, * so we keep it on the same CPU. No hassle vs. reprogramming * the event source in the high resolution case. The softirq * code will take care of this when the timer function has * completed. There is no conflict as we hold the lock until * the timer is enqueued. */ if (unlikely(hrtimer_callback_running(timer))) return base; /* See the comment in lock_hrtimer_base() */ WRITE_ONCE(timer->base, &migration_base); raw_spin_unlock(&base->cpu_base->lock); raw_spin_lock(&new_base->cpu_base->lock); if (new_cpu_base != this_cpu_base && hrtimer_check_target(timer, new_base)) { raw_spin_unlock(&new_base->cpu_base->lock); raw_spin_lock(&base->cpu_base->lock); new_cpu_base = this_cpu_base; WRITE_ONCE(timer->base, base); goto again; } WRITE_ONCE(timer->base, new_base); } else { if (new_cpu_base != this_cpu_base && hrtimer_check_target(timer, new_base)) { new_cpu_base = this_cpu_base; goto again; } } return new_base; } #else /* CONFIG_SMP */ static inline bool is_migration_base(struct hrtimer_clock_base *base) { return false; } static inline struct hrtimer_clock_base * lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) { struct hrtimer_clock_base *base = timer->base; raw_spin_lock_irqsave(&base->cpu_base->lock, *flags); return base; } # define switch_hrtimer_base(t, b, p) (b) #endif /* !CONFIG_SMP */ /* * Functions for the union type storage format of ktime_t which are * too large for inlining: */ #if BITS_PER_LONG < 64 /* * Divide a ktime value by a nanosecond value */ s64 __ktime_divns(const ktime_t kt, s64 div) { int sft = 0; s64 dclc; u64 tmp; dclc = ktime_to_ns(kt); tmp = dclc < 0 ? -dclc : dclc; /* Make sure the divisor is less than 2^32: */ while (div >> 32) { sft++; div >>= 1; } tmp >>= sft; do_div(tmp, (u32) div); return dclc < 0 ? 
-tmp : tmp; } EXPORT_SYMBOL_GPL(__ktime_divns); #endif /* BITS_PER_LONG >= 64 */ /* * Add two ktime values and do a safety check for overflow: */ ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs) { ktime_t res = ktime_add_unsafe(lhs, rhs); /* * We use KTIME_SEC_MAX here, the maximum timeout which we can * return to user space in a timespec: */ if (res < 0 || res < lhs || res < rhs) res = ktime_set(KTIME_SEC_MAX, 0); return res; } EXPORT_SYMBOL_GPL(ktime_add_safe); #ifdef CONFIG_DEBUG_OBJECTS_TIMERS static const struct debug_obj_descr hrtimer_debug_descr; static void *hrtimer_debug_hint(void *addr) { return ((struct hrtimer *) addr)->function; } /* * fixup_init is called when: * - an active object is initialized */ static bool hrtimer_fixup_init(void *addr, enum debug_obj_state state) { struct hrtimer *timer = addr; switch (state) { case ODEBUG_STATE_ACTIVE: hrtimer_cancel(timer); debug_object_init(timer, &hrtimer_debug_descr); return true; default: return false; } } /* * fixup_activate is called when: * - an active object is activated * - an unknown non-static object is activated */ static bool hrtimer_fixup_activate(void *addr, enum debug_obj_state state) { switch (state) { case ODEBUG_STATE_ACTIVE: WARN_ON(1); fallthrough; default: return false; } } /* * fixup_free is called when: * - an active object is freed */ static bool hrtimer_fixup_free(void *addr, enum debug_obj_state state) { struct hrtimer *timer = addr; switch (state) { case ODEBUG_STATE_ACTIVE: hrtimer_cancel(timer); debug_object_free(timer, &hrtimer_debug_descr); return true; default: return false; } } static const struct debug_obj_descr hrtimer_debug_descr = { .name = "hrtimer", .debug_hint = hrtimer_debug_hint, .fixup_init = hrtimer_fixup_init, .fixup_activate = hrtimer_fixup_activate, .fixup_free = hrtimer_fixup_free, }; static inline void debug_hrtimer_init(struct hrtimer *timer) { debug_object_init(timer, &hrtimer_debug_descr); } static inline void debug_hrtimer_activate(struct hrtimer *timer, enum hrtimer_mode mode) { debug_object_activate(timer, &hrtimer_debug_descr); } static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { debug_object_deactivate(timer, &hrtimer_debug_descr); } static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, enum hrtimer_mode mode); void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t clock_id, enum hrtimer_mode mode) { debug_object_init_on_stack(timer, &hrtimer_debug_descr); __hrtimer_init(timer, clock_id, mode); } EXPORT_SYMBOL_GPL(hrtimer_init_on_stack); static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl, clockid_t clock_id, enum hrtimer_mode mode); void hrtimer_init_sleeper_on_stack(struct hrtimer_sleeper *sl, clockid_t clock_id, enum hrtimer_mode mode) { debug_object_init_on_stack(&sl->timer, &hrtimer_debug_descr); __hrtimer_init_sleeper(sl, clock_id, mode); } EXPORT_SYMBOL_GPL(hrtimer_init_sleeper_on_stack); void destroy_hrtimer_on_stack(struct hrtimer *timer) { debug_object_free(timer, &hrtimer_debug_descr); } EXPORT_SYMBOL_GPL(destroy_hrtimer_on_stack); #else static inline void debug_hrtimer_init(struct hrtimer *timer) { } static inline void debug_hrtimer_activate(struct hrtimer *timer, enum hrtimer_mode mode) { } static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { } #endif static inline void debug_init(struct hrtimer *timer, clockid_t clockid, enum hrtimer_mode mode) { debug_hrtimer_init(timer); trace_hrtimer_init(timer, clockid, mode); } static inline void debug_activate(struct hrtimer *timer, enum 
hrtimer_mode mode) { debug_hrtimer_activate(timer, mode); trace_hrtimer_start(timer, mode); } static inline void debug_deactivate(struct hrtimer *timer) { debug_hrtimer_deactivate(timer); trace_hrtimer_cancel(timer); } static struct hrtimer_clock_base * __next_base(struct hrtimer_cpu_base *cpu_base, unsigned int *active) { unsigned int idx; if (!*active) return NULL; idx = __ffs(*active); *active &= ~(1U << idx); return &cpu_base->clock_base[idx]; } #define for_each_active_base(base, cpu_base, active) \ while ((base = __next_base((cpu_base), &(active)))) static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base, const struct hrtimer *exclude, unsigned int active, ktime_t expires_next) { struct hrtimer_clock_base *base; ktime_t expires; for_each_active_base(base, cpu_base, active) { struct timerqueue_node *next; struct hrtimer *timer; next = timerqueue_getnext(&base->active); timer = container_of(next, struct hrtimer, node); if (timer == exclude) { /* Get to the next timer in the queue. */ next = timerqueue_iterate_next(next); if (!next) continue; timer = container_of(next, struct hrtimer, node); } expires = ktime_sub(hrtimer_get_expires(timer), base->offset); if (expires < expires_next) { expires_next = expires; /* Skip cpu_base update if a timer is being excluded. */ if (exclude) continue; if (timer->is_soft) cpu_base->softirq_next_timer = timer; else cpu_base->next_timer = timer; } } /* * clock_was_set() might have changed base->offset of any of * the clock bases so the result might be negative. Fix it up * to prevent a false positive in clockevents_program_event(). */ if (expires_next < 0) expires_next = 0; return expires_next; } /* * Recomputes cpu_base::*next_timer and returns the earliest expires_next * but does not set cpu_base::*expires_next, that is done by * hrtimer[_force]_reprogram and hrtimer_interrupt only. When updating * cpu_base::*expires_next right away, reprogramming logic would no longer * work. * * When a softirq is pending, we can ignore the HRTIMER_ACTIVE_SOFT bases, * those timers will get run whenever the softirq gets handled, at the end of * hrtimer_run_softirq(), hrtimer_update_softirq_timer() will re-add these bases. * * Therefore softirq values are those from the HRTIMER_ACTIVE_SOFT clock bases. * The !softirq values are the minima across HRTIMER_ACTIVE_ALL, unless an actual * softirq is pending, in which case they're the minima of HRTIMER_ACTIVE_HARD. * * @active_mask must be one of: * - HRTIMER_ACTIVE_ALL, * - HRTIMER_ACTIVE_SOFT, or * - HRTIMER_ACTIVE_HARD. */ static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_mask) { unsigned int active; struct hrtimer *next_timer = NULL; ktime_t expires_next = KTIME_MAX; if (!cpu_base->softirq_activated && (active_mask & HRTIMER_ACTIVE_SOFT)) { active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT; cpu_base->softirq_next_timer = NULL; expires_next = __hrtimer_next_event_base(cpu_base, NULL, active, KTIME_MAX); next_timer = cpu_base->softirq_next_timer; } if (active_mask & HRTIMER_ACTIVE_HARD) { active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD; cpu_base->next_timer = next_timer; expires_next = __hrtimer_next_event_base(cpu_base, NULL, active, expires_next); } return expires_next; } static ktime_t hrtimer_update_next_event(struct hrtimer_cpu_base *cpu_base) { ktime_t expires_next, soft = KTIME_MAX; /* * If the soft interrupt has already been activated, ignore the * soft bases. They will be handled in the already raised soft * interrupt. 
*/ if (!cpu_base->softirq_activated) { soft = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_SOFT); /* * Update the soft expiry time. clock_settime() might have * affected it. */ cpu_base->softirq_expires_next = soft; } expires_next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_HARD); /* * If a softirq timer is expiring first, update cpu_base->next_timer * and program the hardware with the soft expiry time. */ if (expires_next > soft) { cpu_base->next_timer = cpu_base->softirq_next_timer; expires_next = soft; } return expires_next; } static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) { ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset; ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset; ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset; ktime_t now = ktime_get_update_offsets_now(&base->clock_was_set_seq, offs_real, offs_boot, offs_tai); base->clock_base[HRTIMER_BASE_REALTIME_SOFT].offset = *offs_real; base->clock_base[HRTIMER_BASE_BOOTTIME_SOFT].offset = *offs_boot; base->clock_base[HRTIMER_BASE_TAI_SOFT].offset = *offs_tai; return now; } /* * Is the high resolution mode active ? */ static inline int __hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base) { return IS_ENABLED(CONFIG_HIGH_RES_TIMERS) ? cpu_base->hres_active : 0; } static inline int hrtimer_hres_active(void) { return __hrtimer_hres_active(this_cpu_ptr(&hrtimer_bases)); } static void __hrtimer_reprogram(struct hrtimer_cpu_base *cpu_base, struct hrtimer *next_timer, ktime_t expires_next) { cpu_base->expires_next = expires_next; /* * If hres is not active, hardware does not have to be * reprogrammed yet. * * If a hang was detected in the last timer interrupt then we * leave the hang delay active in the hardware. We want the * system to make progress. That also prevents the following * scenario: * T1 expires 50ms from now * T2 expires 5s from now * * T1 is removed, so this code is called and would reprogram * the hardware to 5s from now. Any hrtimer_start after that * will not reprogram the hardware due to hang_detected being * set. So we'd effectively block all timers until the T2 event * fires. */ if (!__hrtimer_hres_active(cpu_base) || cpu_base->hang_detected) return; tick_program_event(expires_next, 1); } /* * Reprogram the event source with checking both queues for the * next event * Called with interrupts disabled and base->lock held */ static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) { ktime_t expires_next; expires_next = hrtimer_update_next_event(cpu_base); if (skip_equal && expires_next == cpu_base->expires_next) return; __hrtimer_reprogram(cpu_base, cpu_base->next_timer, expires_next); } /* High resolution timer related functions */ #ifdef CONFIG_HIGH_RES_TIMERS /* * High resolution timer enabled ? 
*/ static bool hrtimer_hres_enabled __read_mostly = true; unsigned int hrtimer_resolution __read_mostly = LOW_RES_NSEC; EXPORT_SYMBOL_GPL(hrtimer_resolution); /* * Enable / Disable high resolution mode */ static int __init setup_hrtimer_hres(char *str) { return (kstrtobool(str, &hrtimer_hres_enabled) == 0); } __setup("highres=", setup_hrtimer_hres); /* * hrtimer_high_res_enabled - query, if the highres mode is enabled */ static inline int hrtimer_is_hres_enabled(void) { return hrtimer_hres_enabled; } static void retrigger_next_event(void *arg); /* * Switch to high resolution mode */ static void hrtimer_switch_to_hres(void) { struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases); if (tick_init_highres()) { pr_warn("Could not switch to high resolution mode on CPU %u\n", base->cpu); return; } base->hres_active = 1; hrtimer_resolution = HIGH_RES_NSEC; tick_setup_sched_timer(); /* "Retrigger" the interrupt to get things going */ retrigger_next_event(NULL); } #else static inline int hrtimer_is_hres_enabled(void) { return 0; } static inline void hrtimer_switch_to_hres(void) { } #endif /* CONFIG_HIGH_RES_TIMERS */ /* * Retrigger next event is called after clock was set with interrupts * disabled through an SMP function call or directly from low level * resume code. * * This is only invoked when: * - CONFIG_HIGH_RES_TIMERS is enabled. * - CONFIG_NOHZ_COMMON is enabled * * For the other cases this function is empty and because the call sites * are optimized out it vanishes as well, i.e. no need for lots of * #ifdeffery. */ static void retrigger_next_event(void *arg) { struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases); /* * When high resolution mode or nohz is active, then the offsets of * CLOCK_REALTIME/TAI/BOOTTIME have to be updated. Otherwise the * next tick will take care of that. * * If high resolution mode is active then the next expiring timer * must be reevaluated and the clock event device reprogrammed if * necessary. * * In the NOHZ case the update of the offset and the reevaluation * of the next expiring timer is enough. The return from the SMP * function call will take care of the reprogramming in case the * CPU was in a NOHZ idle sleep. */ if (!__hrtimer_hres_active(base) && !tick_nohz_active) return; raw_spin_lock(&base->lock); hrtimer_update_base(base); if (__hrtimer_hres_active(base)) hrtimer_force_reprogram(base, 0); else hrtimer_update_next_event(base); raw_spin_unlock(&base->lock); } /* * When a timer is enqueued and expires earlier than the already enqueued * timers, we have to check, whether it expires earlier than the timer for * which the clock event device was armed. * * Called with interrupts disabled and base->cpu_base.lock held */ static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); struct hrtimer_clock_base *base = timer->base; ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset); WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0); /* * CLOCK_REALTIME timer might be requested with an absolute * expiry time which is less than base->offset. Set it to 0. */ if (expires < 0) expires = 0; if (timer->is_soft) { /* * soft hrtimer could be started on a remote CPU. In this * case softirq_expires_next needs to be updated on the * remote CPU. The soft hrtimer will not expire before the * first hard hrtimer on the remote CPU - * hrtimer_check_target() prevents this case. 
*/ struct hrtimer_cpu_base *timer_cpu_base = base->cpu_base; if (timer_cpu_base->softirq_activated) return; if (!ktime_before(expires, timer_cpu_base->softirq_expires_next)) return; timer_cpu_base->softirq_next_timer = timer; timer_cpu_base->softirq_expires_next = expires; if (!ktime_before(expires, timer_cpu_base->expires_next) || !reprogram) return; } /* * If the timer is not on the current cpu, we cannot reprogram * the other cpus clock event device. */ if (base->cpu_base != cpu_base) return; if (expires >= cpu_base->expires_next) return; /* * If the hrtimer interrupt is running, then it will reevaluate the * clock bases and reprogram the clock event device. */ if (cpu_base->in_hrtirq) return; cpu_base->next_timer = timer; __hrtimer_reprogram(cpu_base, timer, expires); } static bool update_needs_ipi(struct hrtimer_cpu_base *cpu_base, unsigned int active) { struct hrtimer_clock_base *base; unsigned int seq; ktime_t expires; /* * Update the base offsets unconditionally so the following * checks whether the SMP function call is required works. * * The update is safe even when the remote CPU is in the hrtimer * interrupt or the hrtimer soft interrupt and expiring affected * bases. Either it will see the update before handling a base or * it will see it when it finishes the processing and reevaluates * the next expiring timer. */ seq = cpu_base->clock_was_set_seq; hrtimer_update_base(cpu_base); /* * If the sequence did not change over the update then the * remote CPU already handled it. */ if (seq == cpu_base->clock_was_set_seq) return false; /* * If the remote CPU is currently handling an hrtimer interrupt, it * will reevaluate the first expiring timer of all clock bases * before reprogramming. Nothing to do here. */ if (cpu_base->in_hrtirq) return false; /* * Walk the affected clock bases and check whether the first expiring * timer in a clock base is moving ahead of the first expiring timer of * @cpu_base. If so, the IPI must be invoked because per CPU clock * event devices cannot be remotely reprogrammed. */ active &= cpu_base->active_bases; for_each_active_base(base, cpu_base, active) { struct timerqueue_node *next; next = timerqueue_getnext(&base->active); expires = ktime_sub(next->expires, base->offset); if (expires < cpu_base->expires_next) return true; /* Extra check for softirq clock bases */ if (base->clockid < HRTIMER_BASE_MONOTONIC_SOFT) continue; if (cpu_base->softirq_activated) continue; if (expires < cpu_base->softirq_expires_next) return true; } return false; } /* * Clock was set. This might affect CLOCK_REALTIME, CLOCK_TAI and * CLOCK_BOOTTIME (for late sleep time injection). * * This requires to update the offsets for these clocks * vs. CLOCK_MONOTONIC. When high resolution timers are enabled, then this * also requires to eventually reprogram the per CPU clock event devices * when the change moves an affected timer ahead of the first expiring * timer on that CPU. Obviously remote per CPU clock event devices cannot * be reprogrammed. The other reason why an IPI has to be sent is when the * system is in !HIGH_RES and NOHZ mode. The NOHZ mode updates the offsets * in the tick, which obviously might be stopped, so this has to bring out * the remote CPU which might sleep in idle to get this sorted. 
*/ void clock_was_set(unsigned int bases) { struct hrtimer_cpu_base *cpu_base = raw_cpu_ptr(&hrtimer_bases); cpumask_var_t mask; int cpu; if (!__hrtimer_hres_active(cpu_base) && !tick_nohz_active) goto out_timerfd; if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) { on_each_cpu(retrigger_next_event, NULL, 1); goto out_timerfd; } /* Avoid interrupting CPUs if possible */ cpus_read_lock(); for_each_online_cpu(cpu) { unsigned long flags; cpu_base = &per_cpu(hrtimer_bases, cpu); raw_spin_lock_irqsave(&cpu_base->lock, flags); if (update_needs_ipi(cpu_base, bases)) cpumask_set_cpu(cpu, mask); raw_spin_unlock_irqrestore(&cpu_base->lock, flags); } preempt_disable(); smp_call_function_many(mask, retrigger_next_event, NULL, 1); preempt_enable(); cpus_read_unlock(); free_cpumask_var(mask); out_timerfd: timerfd_clock_was_set(); } static void clock_was_set_work(struct work_struct *work) { clock_was_set(CLOCK_SET_WALL); } static DECLARE_WORK(hrtimer_work, clock_was_set_work); /* * Called from timekeeping code to reprogram the hrtimer interrupt device * on all cpus and to notify timerfd. */ void clock_was_set_delayed(void) { schedule_work(&hrtimer_work); } /* * Called during resume either directly from via timekeeping_resume() * or in the case of s2idle from tick_unfreeze() to ensure that the * hrtimers are up to date. */ void hrtimers_resume_local(void) { lockdep_assert_irqs_disabled(); /* Retrigger on the local CPU */ retrigger_next_event(NULL); } /* * Counterpart to lock_hrtimer_base above: */ static inline void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) { raw_spin_unlock_irqrestore(&timer->base->cpu_base->lock, *flags); } /** * hrtimer_forward - forward the timer expiry * @timer: hrtimer to forward * @now: forward past this time * @interval: the interval to forward * * Forward the timer expiry so it will expire in the future. * Returns the number of overruns. * * Can be safely called from the callback function of @timer. If * called from other contexts @timer must neither be enqueued nor * running the callback and the caller needs to take care of * serialization. * * Note: This only updates the timer expiry value and does not requeue * the timer. */ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval) { u64 orun = 1; ktime_t delta; delta = ktime_sub(now, hrtimer_get_expires(timer)); if (delta < 0) return 0; if (WARN_ON(timer->state & HRTIMER_STATE_ENQUEUED)) return 0; if (interval < hrtimer_resolution) interval = hrtimer_resolution; if (unlikely(delta >= interval)) { s64 incr = ktime_to_ns(interval); orun = ktime_divns(delta, incr); hrtimer_add_expires_ns(timer, incr * orun); if (hrtimer_get_expires_tv64(timer) > now) return orun; /* * This (and the ktime_add() below) is the * correction for exact: */ orun++; } hrtimer_add_expires(timer, interval); return orun; } EXPORT_SYMBOL_GPL(hrtimer_forward); /* * enqueue_hrtimer - internal function to (re)start a timer * * The timer is inserted in expiry order. Insertion into the * red black tree is O(log(n)). Must hold the base lock. * * Returns 1 when the new timer is the leftmost timer in the tree. 
*/ static int enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, enum hrtimer_mode mode) { debug_activate(timer, mode); WARN_ON_ONCE(!base->cpu_base->online); base->cpu_base->active_bases |= 1 << base->index; /* Pairs with the lockless read in hrtimer_is_queued() */ WRITE_ONCE(timer->state, HRTIMER_STATE_ENQUEUED); return timerqueue_add(&base->active, &timer->node); } /* * __remove_hrtimer - internal function to remove a timer * * Caller must hold the base lock. * * High resolution timer mode reprograms the clock event device when the * timer is the one which expires next. The caller can disable this by setting * reprogram to zero. This is useful, when the context does a reprogramming * anyway (e.g. timer interrupt) */ static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, u8 newstate, int reprogram) { struct hrtimer_cpu_base *cpu_base = base->cpu_base; u8 state = timer->state; /* Pairs with the lockless read in hrtimer_is_queued() */ WRITE_ONCE(timer->state, newstate); if (!(state & HRTIMER_STATE_ENQUEUED)) return; if (!timerqueue_del(&base->active, &timer->node)) cpu_base->active_bases &= ~(1 << base->index); /* * Note: If reprogram is false we do not update * cpu_base->next_timer. This happens when we remove the first * timer on a remote cpu. No harm as we never dereference * cpu_base->next_timer. So the worst thing what can happen is * an superfluous call to hrtimer_force_reprogram() on the * remote cpu later on if the same timer gets enqueued again. */ if (reprogram && timer == cpu_base->next_timer) hrtimer_force_reprogram(cpu_base, 1); } /* * remove hrtimer, called with base lock held */ static inline int remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool restart, bool keep_local) { u8 state = timer->state; if (state & HRTIMER_STATE_ENQUEUED) { bool reprogram; /* * Remove the timer and force reprogramming when high * resolution mode is active and the timer is on the current * CPU. If we remove a timer on another CPU, reprogramming is * skipped. The interrupt event on this CPU is fired and * reprogramming happens in the interrupt handler. This is a * rare case and less expensive than a smp call. */ debug_deactivate(timer); reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases); /* * If the timer is not restarted then reprogramming is * required if the timer is local. If it is local and about * to be restarted, avoid programming it twice (on removal * and a moment later when it's requeued). */ if (!restart) state = HRTIMER_STATE_INACTIVE; else reprogram &= !keep_local; __remove_hrtimer(timer, base, state, reprogram); return 1; } return 0; } static inline ktime_t hrtimer_update_lowres(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) { #ifdef CONFIG_TIME_LOW_RES /* * CONFIG_TIME_LOW_RES indicates that the system has no way to return * granular time values. For relative timers we add hrtimer_resolution * (i.e. one jiffie) to prevent short timeouts. */ timer->is_rel = mode & HRTIMER_MODE_REL; if (timer->is_rel) tim = ktime_add_safe(tim, hrtimer_resolution); #endif return tim; } static void hrtimer_update_softirq_timer(struct hrtimer_cpu_base *cpu_base, bool reprogram) { ktime_t expires; /* * Find the next SOFT expiration. */ expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_SOFT); /* * reprogramming needs to be triggered, even if the next soft * hrtimer expires at the same time than the next hard * hrtimer. cpu_base->softirq_expires_next needs to be updated! 
*/ if (expires == KTIME_MAX) return; /* * cpu_base->*next_timer is recomputed by __hrtimer_get_next_event() * cpu_base->*expires_next is only set by hrtimer_reprogram() */ hrtimer_reprogram(cpu_base->softirq_next_timer, reprogram); } static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 delta_ns, const enum hrtimer_mode mode, struct hrtimer_clock_base *base) { struct hrtimer_clock_base *new_base; bool force_local, first; /* * If the timer is on the local cpu base and is the first expiring * timer then this might end up reprogramming the hardware twice * (on removal and on enqueue). To avoid that by prevent the * reprogram on removal, keep the timer local to the current CPU * and enforce reprogramming after it is queued no matter whether * it is the new first expiring timer again or not. */ force_local = base->cpu_base == this_cpu_ptr(&hrtimer_bases); force_local &= base->cpu_base->next_timer == timer; /* * Remove an active timer from the queue. In case it is not queued * on the current CPU, make sure that remove_hrtimer() updates the * remote data correctly. * * If it's on the current CPU and the first expiring timer, then * skip reprogramming, keep the timer local and enforce * reprogramming later if it was the first expiring timer. This * avoids programming the underlying clock event twice (once at * removal and once after enqueue). */ remove_hrtimer(timer, base, true, force_local); if (mode & HRTIMER_MODE_REL) tim = ktime_add_safe(tim, base->get_time()); tim = hrtimer_update_lowres(timer, tim, mode); hrtimer_set_expires_range_ns(timer, tim, delta_ns); /* Switch the timer base, if necessary: */ if (!force_local) { new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED); } else { new_base = base; } first = enqueue_hrtimer(timer, new_base, mode); if (!force_local) return first; /* * Timer was forced to stay on the current CPU to avoid * reprogramming on removal and enqueue. Force reprogram the * hardware by evaluating the new first expiring timer. */ hrtimer_force_reprogram(new_base->cpu_base, 1); return 0; } /** * hrtimer_start_range_ns - (re)start an hrtimer * @timer: the timer to be added * @tim: expiry time * @delta_ns: "slack" range for the timer * @mode: timer mode: absolute (HRTIMER_MODE_ABS) or * relative (HRTIMER_MODE_REL), and pinned (HRTIMER_MODE_PINNED); * softirq based mode is considered for debug purpose only! */ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 delta_ns, const enum hrtimer_mode mode) { struct hrtimer_clock_base *base; unsigned long flags; if (WARN_ON_ONCE(!timer->function)) return; /* * Check whether the HRTIMER_MODE_SOFT bit and hrtimer.is_soft * match on CONFIG_PREEMPT_RT = n. With PREEMPT_RT check the hard * expiry mode because unmarked timers are moved to softirq expiry. 
*/ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) WARN_ON_ONCE(!(mode & HRTIMER_MODE_SOFT) ^ !timer->is_soft); else WARN_ON_ONCE(!(mode & HRTIMER_MODE_HARD) ^ !timer->is_hard); base = lock_hrtimer_base(timer, &flags); if (__hrtimer_start_range_ns(timer, tim, delta_ns, mode, base)) hrtimer_reprogram(timer, true); unlock_hrtimer_base(timer, &flags); } EXPORT_SYMBOL_GPL(hrtimer_start_range_ns); /** * hrtimer_try_to_cancel - try to deactivate a timer * @timer: hrtimer to stop * * Returns: * * * 0 when the timer was not active * * 1 when the timer was active * * -1 when the timer is currently executing the callback function and * cannot be stopped */ int hrtimer_try_to_cancel(struct hrtimer *timer) { struct hrtimer_clock_base *base; unsigned long flags; int ret = -1; /* * Check lockless first. If the timer is not active (neither * enqueued nor running the callback, nothing to do here. The * base lock does not serialize against a concurrent enqueue, * so we can avoid taking it. */ if (!hrtimer_active(timer)) return 0; base = lock_hrtimer_base(timer, &flags); if (!hrtimer_callback_running(timer)) ret = remove_hrtimer(timer, base, false, false); unlock_hrtimer_base(timer, &flags); return ret; } EXPORT_SYMBOL_GPL(hrtimer_try_to_cancel); #ifdef CONFIG_PREEMPT_RT static void hrtimer_cpu_base_init_expiry_lock(struct hrtimer_cpu_base *base) { spin_lock_init(&base->softirq_expiry_lock); } static void hrtimer_cpu_base_lock_expiry(struct hrtimer_cpu_base *base) { spin_lock(&base->softirq_expiry_lock); } static void hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base) { spin_unlock(&base->softirq_expiry_lock); } /* * The counterpart to hrtimer_cancel_wait_running(). * * If there is a waiter for cpu_base->expiry_lock, then it was waiting for * the timer callback to finish. Drop expiry_lock and reacquire it. That * allows the waiter to acquire the lock and make progress. */ static void hrtimer_sync_wait_running(struct hrtimer_cpu_base *cpu_base, unsigned long flags) { if (atomic_read(&cpu_base->timer_waiters)) { raw_spin_unlock_irqrestore(&cpu_base->lock, flags); spin_unlock(&cpu_base->softirq_expiry_lock); spin_lock(&cpu_base->softirq_expiry_lock); raw_spin_lock_irq(&cpu_base->lock); } } /* * This function is called on PREEMPT_RT kernels when the fast path * deletion of a timer failed because the timer callback function was * running. * * This prevents priority inversion: if the soft irq thread is preempted * in the middle of a timer callback, then calling del_timer_sync() can * lead to two issues: * * - If the caller is on a remote CPU then it has to spin wait for the timer * handler to complete. This can result in unbound priority inversion. * * - If the caller originates from the task which preempted the timer * handler on the same CPU, then spin waiting for the timer handler to * complete is never going to end. */ void hrtimer_cancel_wait_running(const struct hrtimer *timer) { /* Lockless read. Prevent the compiler from reloading it below */ struct hrtimer_clock_base *base = READ_ONCE(timer->base); /* * Just relax if the timer expires in hard interrupt context or if * it is currently on the migration base. */ if (!timer->is_soft || is_migration_base(base)) { cpu_relax(); return; } /* * Mark the base as contended and grab the expiry lock, which is * held by the softirq across the timer callback. Drop the lock * immediately so the softirq can expire the next timer. In theory * the timer could already be running again, but that's more than * unlikely and just causes another wait loop. 
*/ atomic_inc(&base->cpu_base->timer_waiters); spin_lock_bh(&base->cpu_base->softirq_expiry_lock); atomic_dec(&base->cpu_base->timer_waiters); spin_unlock_bh(&base->cpu_base->softirq_expiry_lock); } #else static inline void hrtimer_cpu_base_init_expiry_lock(struct hrtimer_cpu_base *base) { } static inline void hrtimer_cpu_base_lock_expiry(struct hrtimer_cpu_base *base) { } static inline void hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base) { } static inline void hrtimer_sync_wait_running(struct hrtimer_cpu_base *base, unsigned long flags) { } #endif /** * hrtimer_cancel - cancel a timer and wait for the handler to finish. * @timer: the timer to be cancelled * * Returns: * 0 when the timer was not active * 1 when the timer was active */ int hrtimer_cancel(struct hrtimer *timer) { int ret; do { ret = hrtimer_try_to_cancel(timer); if (ret < 0) hrtimer_cancel_wait_running(timer); } while (ret < 0); return ret; } EXPORT_SYMBOL_GPL(hrtimer_cancel); /** * __hrtimer_get_remaining - get remaining time for the timer * @timer: the timer to read * @adjust: adjust relative timers when CONFIG_TIME_LOW_RES=y */ ktime_t __hrtimer_get_remaining(const struct hrtimer *timer, bool adjust) { unsigned long flags; ktime_t rem; lock_hrtimer_base(timer, &flags); if (IS_ENABLED(CONFIG_TIME_LOW_RES) && adjust) rem = hrtimer_expires_remaining_adjusted(timer); else rem = hrtimer_expires_remaining(timer); unlock_hrtimer_base(timer, &flags); return rem; } EXPORT_SYMBOL_GPL(__hrtimer_get_remaining); #ifdef CONFIG_NO_HZ_COMMON /** * hrtimer_get_next_event - get the time until next expiry event * * Returns the next expiry time or KTIME_MAX if no timer is pending. */ u64 hrtimer_get_next_event(void) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); u64 expires = KTIME_MAX; unsigned long flags; raw_spin_lock_irqsave(&cpu_base->lock, flags); if (!__hrtimer_hres_active(cpu_base)) expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL); raw_spin_unlock_irqrestore(&cpu_base->lock, flags); return expires; } /** * hrtimer_next_event_without - time until next expiry event w/o one timer * @exclude: timer to exclude * * Returns the next expiry time over all timers except for the @exclude one or * KTIME_MAX if none of them is pending. */ u64 hrtimer_next_event_without(const struct hrtimer *exclude) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); u64 expires = KTIME_MAX; unsigned long flags; raw_spin_lock_irqsave(&cpu_base->lock, flags); if (__hrtimer_hres_active(cpu_base)) { unsigned int active; if (!cpu_base->softirq_activated) { active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT; expires = __hrtimer_next_event_base(cpu_base, exclude, active, KTIME_MAX); } active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD; expires = __hrtimer_next_event_base(cpu_base, exclude, active, expires); } raw_spin_unlock_irqrestore(&cpu_base->lock, flags); return expires; } #endif static inline int hrtimer_clockid_to_base(clockid_t clock_id) { if (likely(clock_id < MAX_CLOCKS)) { int base = hrtimer_clock_to_base_table[clock_id]; if (likely(base != HRTIMER_MAX_CLOCK_BASES)) return base; } WARN(1, "Invalid clockid %d. 
Using MONOTONIC\n", clock_id); return HRTIMER_BASE_MONOTONIC; } static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, enum hrtimer_mode mode) { bool softtimer = !!(mode & HRTIMER_MODE_SOFT); struct hrtimer_cpu_base *cpu_base; int base; /* * On PREEMPT_RT enabled kernels hrtimers which are not explicitly * marked for hard interrupt expiry mode are moved into soft * interrupt context for latency reasons and because the callbacks * can invoke functions which might sleep on RT, e.g. spin_lock(). */ if (IS_ENABLED(CONFIG_PREEMPT_RT) && !(mode & HRTIMER_MODE_HARD)) softtimer = true; memset(timer, 0, sizeof(struct hrtimer)); cpu_base = raw_cpu_ptr(&hrtimer_bases); /* * POSIX magic: Relative CLOCK_REALTIME timers are not affected by * clock modifications, so they needs to become CLOCK_MONOTONIC to * ensure POSIX compliance. */ if (clock_id == CLOCK_REALTIME && mode & HRTIMER_MODE_REL) clock_id = CLOCK_MONOTONIC; base = softtimer ? HRTIMER_MAX_CLOCK_BASES / 2 : 0; base += hrtimer_clockid_to_base(clock_id); timer->is_soft = softtimer; timer->is_hard = !!(mode & HRTIMER_MODE_HARD); timer->base = &cpu_base->clock_base[base]; timerqueue_init(&timer->node); } /** * hrtimer_init - initialize a timer to the given clock * @timer: the timer to be initialized * @clock_id: the clock to be used * @mode: The modes which are relevant for initialization: * HRTIMER_MODE_ABS, HRTIMER_MODE_REL, HRTIMER_MODE_ABS_SOFT, * HRTIMER_MODE_REL_SOFT * * The PINNED variants of the above can be handed in, * but the PINNED bit is ignored as pinning happens * when the hrtimer is started */ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, enum hrtimer_mode mode) { debug_init(timer, clock_id, mode); __hrtimer_init(timer, clock_id, mode); } EXPORT_SYMBOL_GPL(hrtimer_init); /* * A timer is active, when it is enqueued into the rbtree or the * callback function is running or it's in the state of being migrated * to another cpu. * * It is important for this function to not return a false negative. */ bool hrtimer_active(const struct hrtimer *timer) { struct hrtimer_clock_base *base; unsigned int seq; do { base = READ_ONCE(timer->base); seq = raw_read_seqcount_begin(&base->seq); if (timer->state != HRTIMER_STATE_INACTIVE || base->running == timer) return true; } while (read_seqcount_retry(&base->seq, seq) || base != READ_ONCE(timer->base)); return false; } EXPORT_SYMBOL_GPL(hrtimer_active); /* * The write_seqcount_barrier()s in __run_hrtimer() split the thing into 3 * distinct sections: * * - queued: the timer is queued * - callback: the timer is being ran * - post: the timer is inactive or (re)queued * * On the read side we ensure we observe timer->state and cpu_base->running * from the same section, if anything changed while we looked at it, we retry. * This includes timer->base changing because sequence numbers alone are * insufficient for that. * * The sequence numbers are required because otherwise we could still observe * a false negative if the read side got smeared over multiple consecutive * __run_hrtimer() invocations. */ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, struct hrtimer_clock_base *base, struct hrtimer *timer, ktime_t *now, unsigned long flags) __must_hold(&cpu_base->lock) { enum hrtimer_restart (*fn)(struct hrtimer *); bool expires_in_hardirq; int restart; lockdep_assert_held(&cpu_base->lock); debug_deactivate(timer); base->running = timer; /* * Separate the ->running assignment from the ->state assignment. 
* * As with a regular write barrier, this ensures the read side in * hrtimer_active() cannot observe base->running == NULL && * timer->state == INACTIVE. */ raw_write_seqcount_barrier(&base->seq); __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0); fn = timer->function; /* * Clear the 'is relative' flag for the TIME_LOW_RES case. If the * timer is restarted with a period then it becomes an absolute * timer. If its not restarted it does not matter. */ if (IS_ENABLED(CONFIG_TIME_LOW_RES)) timer->is_rel = false; /* * The timer is marked as running in the CPU base, so it is * protected against migration to a different CPU even if the lock * is dropped. */ raw_spin_unlock_irqrestore(&cpu_base->lock, flags); trace_hrtimer_expire_entry(timer, now); expires_in_hardirq = lockdep_hrtimer_enter(timer); restart = fn(timer); lockdep_hrtimer_exit(expires_in_hardirq); trace_hrtimer_expire_exit(timer); raw_spin_lock_irq(&cpu_base->lock); /* * Note: We clear the running state after enqueue_hrtimer and * we do not reprogram the event hardware. Happens either in * hrtimer_start_range_ns() or in hrtimer_interrupt() * * Note: Because we dropped the cpu_base->lock above, * hrtimer_start_range_ns() can have popped in and enqueued the timer * for us already. */ if (restart != HRTIMER_NORESTART && !(timer->state & HRTIMER_STATE_ENQUEUED)) enqueue_hrtimer(timer, base, HRTIMER_MODE_ABS); /* * Separate the ->running assignment from the ->state assignment. * * As with a regular write barrier, this ensures the read side in * hrtimer_active() cannot observe base->running.timer == NULL && * timer->state == INACTIVE. */ raw_write_seqcount_barrier(&base->seq); WARN_ON_ONCE(base->running != timer); base->running = NULL; } static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now, unsigned long flags, unsigned int active_mask) { struct hrtimer_clock_base *base; unsigned int active = cpu_base->active_bases & active_mask; for_each_active_base(base, cpu_base, active) { struct timerqueue_node *node; ktime_t basenow; basenow = ktime_add(now, base->offset); while ((node = timerqueue_getnext(&base->active))) { struct hrtimer *timer; timer = container_of(node, struct hrtimer, node); /* * The immediate goal for using the softexpires is * minimizing wakeups, not running timers at the * earliest interrupt after their soft expiration. * This allows us to avoid using a Priority Search * Tree, which can answer a stabbing query for * overlapping intervals and instead use the simple * BST we already have. * We don't add extra wakeups by delaying timers that * are right-of a not yet expired timer, because that * timer will have to trigger a wakeup anyway. 
*/ if (basenow < hrtimer_get_softexpires_tv64(timer)) break; __run_hrtimer(cpu_base, base, timer, &basenow, flags); if (active_mask == HRTIMER_ACTIVE_SOFT) hrtimer_sync_wait_running(cpu_base, flags); } } } static __latent_entropy void hrtimer_run_softirq(struct softirq_action *h) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); unsigned long flags; ktime_t now; hrtimer_cpu_base_lock_expiry(cpu_base); raw_spin_lock_irqsave(&cpu_base->lock, flags); now = hrtimer_update_base(cpu_base); __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_SOFT); cpu_base->softirq_activated = 0; hrtimer_update_softirq_timer(cpu_base, true); raw_spin_unlock_irqrestore(&cpu_base->lock, flags); hrtimer_cpu_base_unlock_expiry(cpu_base); } #ifdef CONFIG_HIGH_RES_TIMERS /* * High resolution timer interrupt * Called with interrupts disabled */ void hrtimer_interrupt(struct clock_event_device *dev) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); ktime_t expires_next, now, entry_time, delta; unsigned long flags; int retries = 0; BUG_ON(!cpu_base->hres_active); cpu_base->nr_events++; dev->next_event = KTIME_MAX; raw_spin_lock_irqsave(&cpu_base->lock, flags); entry_time = now = hrtimer_update_base(cpu_base); retry: cpu_base->in_hrtirq = 1; /* * We set expires_next to KTIME_MAX here with cpu_base->lock * held to prevent that a timer is enqueued in our queue via * the migration code. This does not affect enqueueing of * timers which run their callback and need to be requeued on * this CPU. */ cpu_base->expires_next = KTIME_MAX; if (!ktime_before(now, cpu_base->softirq_expires_next)) { cpu_base->softirq_expires_next = KTIME_MAX; cpu_base->softirq_activated = 1; raise_softirq_irqoff(HRTIMER_SOFTIRQ); } __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD); /* Reevaluate the clock bases for the [soft] next expiry */ expires_next = hrtimer_update_next_event(cpu_base); /* * Store the new expiry value so the migration code can verify * against it. */ cpu_base->expires_next = expires_next; cpu_base->in_hrtirq = 0; raw_spin_unlock_irqrestore(&cpu_base->lock, flags); /* Reprogramming necessary ? */ if (!tick_program_event(expires_next, 0)) { cpu_base->hang_detected = 0; return; } /* * The next timer was already expired due to: * - tracing * - long lasting callbacks * - being scheduled away when running in a VM * * We need to prevent that we loop forever in the hrtimer * interrupt routine. We give it 3 attempts to avoid * overreacting on some spurious event. * * Acquire base lock for updating the offsets and retrieving * the current time. */ raw_spin_lock_irqsave(&cpu_base->lock, flags); now = hrtimer_update_base(cpu_base); cpu_base->nr_retries++; if (++retries < 3) goto retry; /* * Give the system a chance to do something else than looping * here. We stored the entry time, so we know exactly how long * we spent here. We schedule the next event this amount of * time away. */ cpu_base->nr_hangs++; cpu_base->hang_detected = 1; raw_spin_unlock_irqrestore(&cpu_base->lock, flags); delta = ktime_sub(now, entry_time); if ((unsigned int)delta > cpu_base->max_hang_time) cpu_base->max_hang_time = (unsigned int) delta; /* * Limit it to a sensible value as we enforce a longer * delay. Give the CPU at least 100ms to catch up. 
*/ if (delta > 100 * NSEC_PER_MSEC) expires_next = ktime_add_ns(now, 100 * NSEC_PER_MSEC); else expires_next = ktime_add(now, delta); tick_program_event(expires_next, 1); pr_warn_once("hrtimer: interrupt took %llu ns\n", ktime_to_ns(delta)); } /* called with interrupts disabled */ static inline void __hrtimer_peek_ahead_timers(void) { struct tick_device *td; if (!hrtimer_hres_active()) return; td = this_cpu_ptr(&tick_cpu_device); if (td && td->evtdev) hrtimer_interrupt(td->evtdev); } #else /* CONFIG_HIGH_RES_TIMERS */ static inline void __hrtimer_peek_ahead_timers(void) { } #endif /* !CONFIG_HIGH_RES_TIMERS */ /* * Called from run_local_timers in hardirq context every jiffy */ void hrtimer_run_queues(void) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); unsigned long flags; ktime_t now; if (__hrtimer_hres_active(cpu_base)) return; /* * This _is_ ugly: We have to check periodically, whether we * can switch to highres and / or nohz mode. The clocksource * switch happens with xtime_lock held. Notification from * there only sets the check bit in the tick_oneshot code, * otherwise we might deadlock vs. xtime_lock. */ if (tick_check_oneshot_change(!hrtimer_is_hres_enabled())) { hrtimer_switch_to_hres(); return; } raw_spin_lock_irqsave(&cpu_base->lock, flags); now = hrtimer_update_base(cpu_base); if (!ktime_before(now, cpu_base->softirq_expires_next)) { cpu_base->softirq_expires_next = KTIME_MAX; cpu_base->softirq_activated = 1; raise_softirq_irqoff(HRTIMER_SOFTIRQ); } __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD); raw_spin_unlock_irqrestore(&cpu_base->lock, flags); } /* * Sleep related functions: */ static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer) { struct hrtimer_sleeper *t = container_of(timer, struct hrtimer_sleeper, timer); struct task_struct *task = t->task; t->task = NULL; if (task) wake_up_process(task); return HRTIMER_NORESTART; } /** * hrtimer_sleeper_start_expires - Start a hrtimer sleeper timer * @sl: sleeper to be started * @mode: timer mode abs/rel * * Wrapper around hrtimer_start_expires() for hrtimer_sleeper based timers * to allow PREEMPT_RT to tweak the delivery mode (soft/hardirq context) */ void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl, enum hrtimer_mode mode) { /* * Make the enqueue delivery mode check work on RT. If the sleeper * was initialized for hard interrupt delivery, force the mode bit. * This is a special case for hrtimer_sleepers because * hrtimer_init_sleeper() determines the delivery mode on RT so the * fiddling with this decision is avoided at the call sites. */ if (IS_ENABLED(CONFIG_PREEMPT_RT) && sl->timer.is_hard) mode |= HRTIMER_MODE_HARD; hrtimer_start_expires(&sl->timer, mode); } EXPORT_SYMBOL_GPL(hrtimer_sleeper_start_expires); static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl, clockid_t clock_id, enum hrtimer_mode mode) { /* * On PREEMPT_RT enabled kernels hrtimers which are not explicitly * marked for hard interrupt expiry mode are moved into soft * interrupt context either for latency reasons or because the * hrtimer callback takes regular spinlocks or invokes other * functions which are not suitable for hard interrupt context on * PREEMPT_RT. * * The hrtimer_sleeper callback is RT compatible in hard interrupt * context, but there is a latency concern: Untrusted userspace can * spawn many threads which arm timers for the same expiry time on * the same CPU. That causes a latency spike due to the wakeup of * a gazillion threads. 
* * OTOH, privileged real-time user space applications rely on the * low latency of hard interrupt wakeups. If the current task is in * a real-time scheduling class, mark the mode for hard interrupt * expiry. */ if (IS_ENABLED(CONFIG_PREEMPT_RT)) { if (task_is_realtime(current) && !(mode & HRTIMER_MODE_SOFT)) mode |= HRTIMER_MODE_HARD; } __hrtimer_init(&sl->timer, clock_id, mode); sl->timer.function = hrtimer_wakeup; sl->task = current; } /** * hrtimer_init_sleeper - initialize sleeper to the given clock * @sl: sleeper to be initialized * @clock_id: the clock to be used * @mode: timer mode abs/rel */ void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, clockid_t clock_id, enum hrtimer_mode mode) { debug_init(&sl->timer, clock_id, mode); __hrtimer_init_sleeper(sl, clock_id, mode); } EXPORT_SYMBOL_GPL(hrtimer_init_sleeper); int nanosleep_copyout(struct restart_block *restart, struct timespec64 *ts) { switch(restart->nanosleep.type) { #ifdef CONFIG_COMPAT_32BIT_TIME case TT_COMPAT: if (put_old_timespec32(ts, restart->nanosleep.compat_rmtp)) return -EFAULT; break; #endif case TT_NATIVE: if (put_timespec64(ts, restart->nanosleep.rmtp)) return -EFAULT; break; default: BUG(); } return -ERESTART_RESTARTBLOCK; } static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode) { struct restart_block *restart; do { set_current_state(TASK_INTERRUPTIBLE); hrtimer_sleeper_start_expires(t, mode); if (likely(t->task)) freezable_schedule(); hrtimer_cancel(&t->timer); mode = HRTIMER_MODE_ABS; } while (t->task && !signal_pending(current)); __set_current_state(TASK_RUNNING); if (!t->task) return 0; restart = &current->restart_block; if (restart->nanosleep.type != TT_NONE) { ktime_t rem = hrtimer_expires_remaining(&t->timer); struct timespec64 rmt; if (rem <= 0) return 0; rmt = ktime_to_timespec64(rem); return nanosleep_copyout(restart, &rmt); } return -ERESTART_RESTARTBLOCK; } static long __sched hrtimer_nanosleep_restart(struct restart_block *restart) { struct hrtimer_sleeper t; int ret; hrtimer_init_sleeper_on_stack(&t, restart->nanosleep.clockid, HRTIMER_MODE_ABS); hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires); ret = do_nanosleep(&t, HRTIMER_MODE_ABS); destroy_hrtimer_on_stack(&t.timer); return ret; } long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode, const clockid_t clockid) { struct restart_block *restart; struct hrtimer_sleeper t; int ret = 0; u64 slack; slack = current->timer_slack_ns; if (dl_task(current) || rt_task(current)) slack = 0; hrtimer_init_sleeper_on_stack(&t, clockid, mode); hrtimer_set_expires_range_ns(&t.timer, rqtp, slack); ret = do_nanosleep(&t, mode); if (ret != -ERESTART_RESTARTBLOCK) goto out; /* Absolute timers do not update the rmtp value and restart: */ if (mode == HRTIMER_MODE_ABS) { ret = -ERESTARTNOHAND; goto out; } restart = &current->restart_block; restart->nanosleep.clockid = t.timer.base->clockid; restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer); set_restart_fn(restart, hrtimer_nanosleep_restart); out: destroy_hrtimer_on_stack(&t.timer); return ret; } #ifdef CONFIG_64BIT SYSCALL_DEFINE2(nanosleep, struct __kernel_timespec __user *, rqtp, struct __kernel_timespec __user *, rmtp) { struct timespec64 tu; if (get_timespec64(&tu, rqtp)) return -EFAULT; if (!timespec64_valid(&tu)) return -EINVAL; current->restart_block.fn = do_no_restart_syscall; current->restart_block.nanosleep.type = rmtp ? 
TT_NATIVE : TT_NONE; current->restart_block.nanosleep.rmtp = rmtp; return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL, CLOCK_MONOTONIC); } #endif #ifdef CONFIG_COMPAT_32BIT_TIME SYSCALL_DEFINE2(nanosleep_time32, struct old_timespec32 __user *, rqtp, struct old_timespec32 __user *, rmtp) { struct timespec64 tu; if (get_old_timespec32(&tu, rqtp)) return -EFAULT; if (!timespec64_valid(&tu)) return -EINVAL; current->restart_block.fn = do_no_restart_syscall; current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE; current->restart_block.nanosleep.compat_rmtp = rmtp; return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL, CLOCK_MONOTONIC); } #endif /* * Functions related to boot-time initialization: */ int hrtimers_prepare_cpu(unsigned int cpu) { struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); int i; for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { struct hrtimer_clock_base *clock_b = &cpu_base->clock_base[i]; clock_b->cpu_base = cpu_base; seqcount_raw_spinlock_init(&clock_b->seq, &cpu_base->lock); timerqueue_init_head(&clock_b->active); } cpu_base->cpu = cpu; hrtimer_cpu_base_init_expiry_lock(cpu_base); return 0; } int hrtimers_cpu_starting(unsigned int cpu) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); /* Clear out any left over state from a CPU down operation */ cpu_base->active_bases = 0; cpu_base->hres_active = 0; cpu_base->hang_detected = 0; cpu_base->next_timer = NULL; cpu_base->softirq_next_timer = NULL; cpu_base->expires_next = KTIME_MAX; cpu_base->softirq_expires_next = KTIME_MAX; cpu_base->online = 1; return 0; } #ifdef CONFIG_HOTPLUG_CPU static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, struct hrtimer_clock_base *new_base) { struct hrtimer *timer; struct timerqueue_node *node; while ((node = timerqueue_getnext(&old_base->active))) { timer = container_of(node, struct hrtimer, node); BUG_ON(hrtimer_callback_running(timer)); debug_deactivate(timer); /* * Mark it as ENQUEUED not INACTIVE otherwise the * timer could be seen as !active and just vanish away * under us on another CPU */ __remove_hrtimer(timer, old_base, HRTIMER_STATE_ENQUEUED, 0); timer->base = new_base; /* * Enqueue the timers on the new cpu. This does not * reprogram the event device in case the timer * expires before the earliest on this CPU, but we run * hrtimer_interrupt after we migrated everything to * sort out already expired timers and reprogram the * event device. */ enqueue_hrtimer(timer, new_base, HRTIMER_MODE_ABS); } } int hrtimers_cpu_dying(unsigned int dying_cpu) { struct hrtimer_cpu_base *old_base, *new_base; int i, ncpu = cpumask_first(cpu_active_mask); tick_cancel_sched_timer(dying_cpu); old_base = this_cpu_ptr(&hrtimer_bases); new_base = &per_cpu(hrtimer_bases, ncpu); /* * The caller is globally serialized and nobody else * takes two locks at once, deadlock is not possible. */ raw_spin_lock(&old_base->lock); raw_spin_lock_nested(&new_base->lock, SINGLE_DEPTH_NESTING); for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { migrate_hrtimer_list(&old_base->clock_base[i], &new_base->clock_base[i]); } /* * The migration might have changed the first expiring softirq * timer on this CPU. Update it. 
*/ __hrtimer_get_next_event(new_base, HRTIMER_ACTIVE_SOFT); /* Tell the other CPU to retrigger the next event */ smp_call_function_single(ncpu, retrigger_next_event, NULL, 0); raw_spin_unlock(&new_base->lock); old_base->online = 0; raw_spin_unlock(&old_base->lock); return 0; } #endif /* CONFIG_HOTPLUG_CPU */ void __init hrtimers_init(void) { hrtimers_prepare_cpu(smp_processor_id()); hrtimers_cpu_starting(smp_processor_id()); open_softirq(HRTIMER_SOFTIRQ, hrtimer_run_softirq); } /** * schedule_hrtimeout_range_clock - sleep until timeout * @expires: timeout value (ktime_t) * @delta: slack in expires timeout (ktime_t) for SCHED_OTHER tasks * @mode: timer mode * @clock_id: timer clock to be used */ int __sched schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta, const enum hrtimer_mode mode, clockid_t clock_id) { struct hrtimer_sleeper t; /* * Optimize when a zero timeout value is given. It does not * matter whether this is an absolute or a relative time. */ if (expires && *expires == 0) { __set_current_state(TASK_RUNNING); return 0; } /* * A NULL parameter means "infinite" */ if (!expires) { schedule(); return -EINTR; } /* * Override any slack passed by the user if under * rt contraints. */ if (rt_task(current)) delta = 0; hrtimer_init_sleeper_on_stack(&t, clock_id, mode); hrtimer_set_expires_range_ns(&t.timer, *expires, delta); hrtimer_sleeper_start_expires(&t, mode); if (likely(t.task)) schedule(); hrtimer_cancel(&t.timer); destroy_hrtimer_on_stack(&t.timer); __set_current_state(TASK_RUNNING); return !t.task ? 0 : -EINTR; } EXPORT_SYMBOL_GPL(schedule_hrtimeout_range_clock); /** * schedule_hrtimeout_range - sleep until timeout * @expires: timeout value (ktime_t) * @delta: slack in expires timeout (ktime_t) for SCHED_OTHER tasks * @mode: timer mode * * Make the current task sleep until the given expiry time has * elapsed. The routine will return immediately unless * the current task state has been set (see set_current_state()). * * The @delta argument gives the kernel the freedom to schedule the * actual wakeup to a time that is both power and performance friendly * for regular (non RT/DL) tasks. * The kernel give the normal best effort behavior for "@expires+@delta", * but may decide to fire the timer earlier, but no earlier than @expires. * * You can set the task state as follows - * * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to * pass before the routine returns unless the current task is explicitly * woken up, (e.g. by wake_up_process()). * * %TASK_INTERRUPTIBLE - the routine may return early if a signal is * delivered to the current task or the current task is explicitly woken * up. * * The current task state is guaranteed to be TASK_RUNNING when this * routine returns. * * Returns 0 when the timer has expired. If the task was woken before the * timer expired by a signal (only possible in state TASK_INTERRUPTIBLE) or * by an explicit wakeup, it returns -EINTR. */ int __sched schedule_hrtimeout_range(ktime_t *expires, u64 delta, const enum hrtimer_mode mode) { return schedule_hrtimeout_range_clock(expires, delta, mode, CLOCK_MONOTONIC); } EXPORT_SYMBOL_GPL(schedule_hrtimeout_range); /** * schedule_hrtimeout - sleep until timeout * @expires: timeout value (ktime_t) * @mode: timer mode * * Make the current task sleep until the given expiry time has * elapsed. The routine will return immediately unless * the current task state has been set (see set_current_state()). 
* * You can set the task state as follows - * * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to * pass before the routine returns unless the current task is explicitly * woken up, (e.g. by wake_up_process()). * * %TASK_INTERRUPTIBLE - the routine may return early if a signal is * delivered to the current task or the current task is explicitly woken * up. * * The current task state is guaranteed to be TASK_RUNNING when this * routine returns. * * Returns 0 when the timer has expired. If the task was woken before the * timer expired by a signal (only possible in state TASK_INTERRUPTIBLE) or * by an explicit wakeup, it returns -EINTR. */ int __sched schedule_hrtimeout(ktime_t *expires, const enum hrtimer_mode mode) { return schedule_hrtimeout_range(expires, 0, mode); } EXPORT_SYMBOL_GPL(schedule_hrtimeout);
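A minimal userspace sketch (not part of the kernel source above) of the user-visible contract the nanosleep path implements: an interrupted sleep fails with EINTR and the unslept time is written back through the rmtp pointer handled by the syscalls shown earlier. SIGALRM is used here only to force the interruption.

#include <errno.h>
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <time.h>
#include <unistd.h>

/* Empty handler: its only job is to interrupt nanosleep() with EINTR. */
static void on_alarm(int sig) { (void)sig; }

int main(void)
{
	struct sigaction sa;
	struct timespec req = { .tv_sec = 5, .tv_nsec = 0 };
	struct timespec rem = { 0, 0 };

	memset(&sa, 0, sizeof(sa));
	sa.sa_handler = on_alarm;	/* no SA_RESTART: let nanosleep() return EINTR */
	sigaction(SIGALRM, &sa, NULL);
	alarm(1);			/* interrupt the 5s sleep after ~1s */

	if (nanosleep(&req, &rem) == -1 && errno == EINTR)
		printf("interrupted, about %ld.%09ld s left\n",
		       (long)rem.tv_sec, rem.tv_nsec);
	else
		printf("slept the full interval\n");
	return 0;
}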
// SPDX-License-Identifier: GPL-2.0-only #include "cgroup-internal.h" #include <linux/sched/cputime.h> static DEFINE_SPINLOCK(cgroup_rstat_lock); static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock); static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu); static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu) { return per_cpu_ptr(cgrp->rstat_cpu, cpu); } /** * cgroup_rstat_updated - keep track of updated rstat_cpu * @cgrp: target cgroup * @cpu: cpu on which rstat_cpu was updated * * @cgrp's rstat_cpu on @cpu was updated. Put it on the parent's matching * rstat_cpu->updated_children list. See the comment on top of * cgroup_rstat_cpu definition for details. */ void cgroup_rstat_updated(struct cgroup *cgrp, int cpu) { raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu); unsigned long flags; /* * Speculative already-on-list test. This may race leading to * temporary inaccuracies, which is fine. * * Because @parent's updated_children is terminated with @parent * instead of NULL, we can tell whether @cgrp is on the list by * testing the next pointer for NULL. */ if (cgroup_rstat_cpu(cgrp, cpu)->updated_next) return; raw_spin_lock_irqsave(cpu_lock, flags); /* put @cgrp and all ancestors on the corresponding updated lists */ while (true) { struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu); struct cgroup *parent = cgroup_parent(cgrp); struct cgroup_rstat_cpu *prstatc; /* * Both additions and removals are bottom-up. If a cgroup * is already in the tree, all ancestors are.
*/ if (rstatc->updated_next) break; /* Root has no parent to link it to, but mark it busy */ if (!parent) { rstatc->updated_next = cgrp; break; } prstatc = cgroup_rstat_cpu(parent, cpu); rstatc->updated_next = prstatc->updated_children; prstatc->updated_children = cgrp; cgrp = parent; } raw_spin_unlock_irqrestore(cpu_lock, flags); } /** * cgroup_rstat_cpu_pop_updated - iterate and dismantle rstat_cpu updated tree * @pos: current position * @root: root of the tree to traversal * @cpu: target cpu * * Walks the updated rstat_cpu tree on @cpu from @root. %NULL @pos starts * the traversal and %NULL return indicates the end. During traversal, * each returned cgroup is unlinked from the tree. Must be called with the * matching cgroup_rstat_cpu_lock held. * * The only ordering guarantee is that, for a parent and a child pair * covered by a given traversal, if a child is visited, its parent is * guaranteed to be visited afterwards. */ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos, struct cgroup *root, int cpu) { struct cgroup_rstat_cpu *rstatc; if (pos == root) return NULL; /* * We're gonna walk down to the first leaf and visit/remove it. We * can pick whatever unvisited node as the starting point. */ if (!pos) pos = root; else pos = cgroup_parent(pos); /* walk down to the first leaf */ while (true) { rstatc = cgroup_rstat_cpu(pos, cpu); if (rstatc->updated_children == pos) break; pos = rstatc->updated_children; } /* * Unlink @pos from the tree. As the updated_children list is * singly linked, we have to walk it to find the removal point. * However, due to the way we traverse, @pos will be the first * child in most cases. The only exception is @root. */ if (rstatc->updated_next) { struct cgroup *parent = cgroup_parent(pos); if (parent) { struct cgroup_rstat_cpu *prstatc; struct cgroup **nextp; prstatc = cgroup_rstat_cpu(parent, cpu); nextp = &prstatc->updated_children; while (true) { struct cgroup_rstat_cpu *nrstatc; nrstatc = cgroup_rstat_cpu(*nextp, cpu); if (*nextp == pos) break; WARN_ON_ONCE(*nextp == parent); nextp = &nrstatc->updated_next; } *nextp = rstatc->updated_next; } rstatc->updated_next = NULL; return pos; } /* only happens for @root */ return NULL; } /* see cgroup_rstat_flush() */ static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep) __releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock) { int cpu; lockdep_assert_held(&cgroup_rstat_lock); for_each_possible_cpu(cpu) { raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu); struct cgroup *pos = NULL; raw_spin_lock(cpu_lock); while ((pos = cgroup_rstat_cpu_pop_updated(pos, cgrp, cpu))) { struct cgroup_subsys_state *css; cgroup_base_stat_flush(pos, cpu); rcu_read_lock(); list_for_each_entry_rcu(css, &pos->rstat_css_list, rstat_css_node) css->ss->css_rstat_flush(css, cpu); rcu_read_unlock(); } raw_spin_unlock(cpu_lock); /* if @may_sleep, play nice and yield if necessary */ if (may_sleep && (need_resched() || spin_needbreak(&cgroup_rstat_lock))) { spin_unlock_irq(&cgroup_rstat_lock); if (!cond_resched()) cpu_relax(); spin_lock_irq(&cgroup_rstat_lock); } } } /** * cgroup_rstat_flush - flush stats in @cgrp's subtree * @cgrp: target cgroup * * Collect all per-cpu stats in @cgrp's subtree into the global counters * and propagate them upwards. After this function returns, all cgroups in * the subtree have up-to-date ->stat. * * This also gets all cgroups in the subtree including @cgrp off the * ->updated_children lists. * * This function may block. 
*/ void cgroup_rstat_flush(struct cgroup *cgrp) { might_sleep(); spin_lock_irq(&cgroup_rstat_lock); cgroup_rstat_flush_locked(cgrp, true); spin_unlock_irq(&cgroup_rstat_lock); } /** * cgroup_rstat_flush_irqsafe - irqsafe version of cgroup_rstat_flush() * @cgrp: target cgroup * * This function can be called from any context. */ void cgroup_rstat_flush_irqsafe(struct cgroup *cgrp) { unsigned long flags; spin_lock_irqsave(&cgroup_rstat_lock, flags); cgroup_rstat_flush_locked(cgrp, false); spin_unlock_irqrestore(&cgroup_rstat_lock, flags); } /** * cgroup_rstat_flush_hold - flush stats in @cgrp's subtree and hold * @cgrp: target cgroup * * Flush stats in @cgrp's subtree and prevent further flushes. Must be * paired with cgroup_rstat_flush_release(). * * This function may block. */ void cgroup_rstat_flush_hold(struct cgroup *cgrp) __acquires(&cgroup_rstat_lock) { might_sleep(); spin_lock_irq(&cgroup_rstat_lock); cgroup_rstat_flush_locked(cgrp, true); } /** * cgroup_rstat_flush_release - release cgroup_rstat_flush_hold() */ void cgroup_rstat_flush_release(void) __releases(&cgroup_rstat_lock) { spin_unlock_irq(&cgroup_rstat_lock); } int cgroup_rstat_init(struct cgroup *cgrp) { int cpu; /* the root cgrp has rstat_cpu preallocated */ if (!cgrp->rstat_cpu) { cgrp->rstat_cpu = alloc_percpu(struct cgroup_rstat_cpu); if (!cgrp->rstat_cpu) return -ENOMEM; } /* ->updated_children list is self terminated */ for_each_possible_cpu(cpu) { struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu); rstatc->updated_children = cgrp; u64_stats_init(&rstatc->bsync); } return 0; } void cgroup_rstat_exit(struct cgroup *cgrp) { int cpu; cgroup_rstat_flush(cgrp); /* sanity check */ for_each_possible_cpu(cpu) { struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu); if (WARN_ON_ONCE(rstatc->updated_children != cgrp) || WARN_ON_ONCE(rstatc->updated_next)) return; } free_percpu(cgrp->rstat_cpu); cgrp->rstat_cpu = NULL; } void __init cgroup_rstat_boot(void) { int cpu; for_each_possible_cpu(cpu) raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu)); } /* * Functions for cgroup basic resource statistics implemented on top of * rstat. 
*/ static void cgroup_base_stat_add(struct cgroup_base_stat *dst_bstat, struct cgroup_base_stat *src_bstat) { dst_bstat->cputime.utime += src_bstat->cputime.utime; dst_bstat->cputime.stime += src_bstat->cputime.stime; dst_bstat->cputime.sum_exec_runtime += src_bstat->cputime.sum_exec_runtime; } static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat, struct cgroup_base_stat *src_bstat) { dst_bstat->cputime.utime -= src_bstat->cputime.utime; dst_bstat->cputime.stime -= src_bstat->cputime.stime; dst_bstat->cputime.sum_exec_runtime -= src_bstat->cputime.sum_exec_runtime; } static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu) { struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu); struct cgroup *parent = cgroup_parent(cgrp); struct cgroup_base_stat cur, delta; unsigned seq; /* Root-level stats are sourced from system-wide CPU stats */ if (!parent) return; /* fetch the current per-cpu values */ do { seq = __u64_stats_fetch_begin(&rstatc->bsync); cur.cputime = rstatc->bstat.cputime; } while (__u64_stats_fetch_retry(&rstatc->bsync, seq)); /* propagate percpu delta to global */ delta = cur; cgroup_base_stat_sub(&delta, &rstatc->last_bstat); cgroup_base_stat_add(&cgrp->bstat, &delta); cgroup_base_stat_add(&rstatc->last_bstat, &delta); /* propagate global delta to parent (unless that's root) */ if (cgroup_parent(parent)) { delta = cgrp->bstat; cgroup_base_stat_sub(&delta, &cgrp->last_bstat); cgroup_base_stat_add(&parent->bstat, &delta); cgroup_base_stat_add(&cgrp->last_bstat, &delta); } } static struct cgroup_rstat_cpu * cgroup_base_stat_cputime_account_begin(struct cgroup *cgrp, unsigned long *flags) { struct cgroup_rstat_cpu *rstatc; rstatc = get_cpu_ptr(cgrp->rstat_cpu); *flags = u64_stats_update_begin_irqsave(&rstatc->bsync); return rstatc; } static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp, struct cgroup_rstat_cpu *rstatc, unsigned long flags) { u64_stats_update_end_irqrestore(&rstatc->bsync, flags); cgroup_rstat_updated(cgrp, smp_processor_id()); put_cpu_ptr(rstatc); } void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec) { struct cgroup_rstat_cpu *rstatc; unsigned long flags; rstatc = cgroup_base_stat_cputime_account_begin(cgrp, &flags); rstatc->bstat.cputime.sum_exec_runtime += delta_exec; cgroup_base_stat_cputime_account_end(cgrp, rstatc, flags); } void __cgroup_account_cputime_field(struct cgroup *cgrp, enum cpu_usage_stat index, u64 delta_exec) { struct cgroup_rstat_cpu *rstatc; unsigned long flags; rstatc = cgroup_base_stat_cputime_account_begin(cgrp, &flags); switch (index) { case CPUTIME_USER: case CPUTIME_NICE: rstatc->bstat.cputime.utime += delta_exec; break; case CPUTIME_SYSTEM: case CPUTIME_IRQ: case CPUTIME_SOFTIRQ: rstatc->bstat.cputime.stime += delta_exec; break; default: break; } cgroup_base_stat_cputime_account_end(cgrp, rstatc, flags); } /* * compute the cputime for the root cgroup by getting the per cpu data * at a global level, then categorizing the fields in a manner consistent * with how it is done by __cgroup_account_cputime_field for each bit of * cpu time attributed to a cgroup. 
*/ static void root_cgroup_cputime(struct task_cputime *cputime) { int i; cputime->stime = 0; cputime->utime = 0; cputime->sum_exec_runtime = 0; for_each_possible_cpu(i) { struct kernel_cpustat kcpustat; u64 *cpustat = kcpustat.cpustat; u64 user = 0; u64 sys = 0; kcpustat_cpu_fetch(&kcpustat, i); user += cpustat[CPUTIME_USER]; user += cpustat[CPUTIME_NICE]; cputime->utime += user; sys += cpustat[CPUTIME_SYSTEM]; sys += cpustat[CPUTIME_IRQ]; sys += cpustat[CPUTIME_SOFTIRQ]; cputime->stime += sys; cputime->sum_exec_runtime += user; cputime->sum_exec_runtime += sys; cputime->sum_exec_runtime += cpustat[CPUTIME_STEAL]; } } void cgroup_base_stat_cputime_show(struct seq_file *seq) { struct cgroup *cgrp = seq_css(seq)->cgroup; u64 usage, utime, stime; struct task_cputime cputime; if (cgroup_parent(cgrp)) { cgroup_rstat_flush_hold(cgrp); usage = cgrp->bstat.cputime.sum_exec_runtime; cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime, &utime, &stime); cgroup_rstat_flush_release(); } else { root_cgroup_cputime(&cputime); usage = cputime.sum_exec_runtime; utime = cputime.utime; stime = cputime.stime; } do_div(usage, NSEC_PER_USEC); do_div(utime, NSEC_PER_USEC); do_div(stime, NSEC_PER_USEC); seq_printf(seq, "usage_usec %llu\n" "user_usec %llu\n" "system_usec %llu\n", usage, utime, stime); }
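A hedged userspace sketch (not part of the kernel source above) that reads back the usage_usec, user_usec and system_usec fields which cgroup_base_stat_cputime_show() emits into a cgroup v2 cpu.stat file. The default /sys/fs/cgroup path is an assumption about the usual cgroup2 mount point and can be overridden via argv[1].

#include <stdio.h>
#include <string.h>

int main(int argc, char **argv)
{
	/* Assumed default cgroup v2 mount point; pass a specific cgroup
	 * directory as argv[1] to inspect that cgroup instead. */
	const char *cg = (argc > 1) ? argv[1] : "/sys/fs/cgroup";
	char path[512], key[64];
	unsigned long long val;
	FILE *f;

	snprintf(path, sizeof(path), "%s/cpu.stat", cg);
	f = fopen(path, "r");
	if (!f) {
		perror(path);
		return 1;
	}
	/* Keep only the three fields printed by cgroup_base_stat_cputime_show();
	 * any other keys that appear in cpu.stat are skipped. */
	while (fscanf(f, "%63s %llu", key, &val) == 2) {
		if (!strcmp(key, "usage_usec") || !strcmp(key, "user_usec") ||
		    !strcmp(key, "system_usec"))
			printf("%-12s %llu us\n", key, val);
	}
	fclose(f);
	return 0;
}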
// SPDX-License-Identifier: GPL-2.0-only /* * fs/fs-writeback.c * * Copyright (C) 2002, Linus Torvalds. * * Contains all the functions related to writing back and waiting * upon dirty inodes against superblocks, and writing back dirty * pages against inodes. ie: data writeback. Writeout of the * inode itself is not handled here.
* * 10Apr2002 Andrew Morton * Split out of fs/inode.c * Additions for address_space-based writeback */ #include <linux/kernel.h> #include <linux/export.h> #include <linux/spinlock.h> #include <linux/slab.h> #include <linux/sched.h> #include <linux/fs.h> #include <linux/mm.h> #include <linux/pagemap.h> #include <linux/kthread.h> #include <linux/writeback.h> #include <linux/blkdev.h> #include <linux/backing-dev.h> #include <linux/tracepoint.h> #include <linux/device.h> #include <linux/memcontrol.h> #include "internal.h" /* * 4MB minimal write chunk size */ #define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_SHIFT - 10)) /* * Passed into wb_writeback(), essentially a subset of writeback_control */ struct wb_writeback_work { long nr_pages; struct super_block *sb; enum writeback_sync_modes sync_mode; unsigned int tagged_writepages:1; unsigned int for_kupdate:1; unsigned int range_cyclic:1; unsigned int for_background:1; unsigned int for_sync:1; /* sync(2) WB_SYNC_ALL writeback */ unsigned int auto_free:1; /* free on completion */ enum wb_reason reason; /* why was writeback initiated? */ struct list_head list; /* pending work list */ struct wb_completion *done; /* set if the caller waits */ }; /* * If an inode is constantly having its pages dirtied, but then the * updates stop dirtytime_expire_interval seconds in the past, it's * possible for the worst case time between when an inode has its * timestamps updated and when they finally get written out to be two * dirtytime_expire_intervals. We set the default to 12 hours (in * seconds), which means most of the time inodes will have their * timestamps written to disk after 12 hours, but in the worst case a * few inodes might not their timestamps updated for 24 hours. */ unsigned int dirtytime_expire_interval = 12 * 60 * 60; static inline struct inode *wb_inode(struct list_head *head) { return list_entry(head, struct inode, i_io_list); } /* * Include the creation of the trace points after defining the * wb_writeback_work structure and inline functions so that the definition * remains local to this file. */ #define CREATE_TRACE_POINTS #include <trace/events/writeback.h> EXPORT_TRACEPOINT_SYMBOL_GPL(wbc_writepage); static bool wb_io_lists_populated(struct bdi_writeback *wb) { if (wb_has_dirty_io(wb)) { return false; } else { set_bit(WB_has_dirty_io, &wb->state); WARN_ON_ONCE(!wb->avg_write_bandwidth); atomic_long_add(wb->avg_write_bandwidth, &wb->bdi->tot_write_bandwidth); return true; } } static void wb_io_lists_depopulated(struct bdi_writeback *wb) { if (wb_has_dirty_io(wb) && list_empty(&wb->b_dirty) && list_empty(&wb->b_io) && list_empty(&wb->b_more_io)) { clear_bit(WB_has_dirty_io, &wb->state); WARN_ON_ONCE(atomic_long_sub_return(wb->avg_write_bandwidth, &wb->bdi->tot_write_bandwidth) < 0); } } /** * inode_io_list_move_locked - move an inode onto a bdi_writeback IO list * @inode: inode to be moved * @wb: target bdi_writeback * @head: one of @wb->b_{dirty|io|more_io|dirty_time} * * Move @inode->i_io_list to @list of @wb and set %WB_has_dirty_io. * Returns %true if @inode is the first occupant of the !dirty_time IO * lists; otherwise, %false. 
*/ static bool inode_io_list_move_locked(struct inode *inode, struct bdi_writeback *wb, struct list_head *head) { assert_spin_locked(&wb->list_lock); assert_spin_locked(&inode->i_lock); list_move(&inode->i_io_list, head); /* dirty_time doesn't count as dirty_io until expiration */ if (head != &wb->b_dirty_time) return wb_io_lists_populated(wb); wb_io_lists_depopulated(wb); return false; } static void wb_wakeup(struct bdi_writeback *wb) { spin_lock_irq(&wb->work_lock); if (test_bit(WB_registered, &wb->state)) mod_delayed_work(bdi_wq, &wb->dwork, 0); spin_unlock_irq(&wb->work_lock); } static void finish_writeback_work(struct bdi_writeback *wb, struct wb_writeback_work *work) { struct wb_completion *done = work->done; if (work->auto_free) kfree(work); if (done) { wait_queue_head_t *waitq = done->waitq; /* @done can't be accessed after the following dec */ if (atomic_dec_and_test(&done->cnt)) wake_up_all(waitq); } } static void wb_queue_work(struct bdi_writeback *wb, struct wb_writeback_work *work) { trace_writeback_queue(wb, work); if (work->done) atomic_inc(&work->done->cnt); spin_lock_irq(&wb->work_lock); if (test_bit(WB_registered, &wb->state)) { list_add_tail(&work->list, &wb->work_list); mod_delayed_work(bdi_wq, &wb->dwork, 0); } else finish_writeback_work(wb, work); spin_unlock_irq(&wb->work_lock); } /** * wb_wait_for_completion - wait for completion of bdi_writeback_works * @done: target wb_completion * * Wait for one or more work items issued to @bdi with their ->done field * set to @done, which should have been initialized with * DEFINE_WB_COMPLETION(). This function returns after all such work items * are completed. Work items which are waited upon aren't freed * automatically on completion. */ void wb_wait_for_completion(struct wb_completion *done) { atomic_dec(&done->cnt); /* put down the initial count */ wait_event(*done->waitq, !atomic_read(&done->cnt)); } #ifdef CONFIG_CGROUP_WRITEBACK /* * Parameters for foreign inode detection, see wbc_detach_inode() to see * how they're used. * * These paramters are inherently heuristical as the detection target * itself is fuzzy. All we want to do is detaching an inode from the * current owner if it's being written to by some other cgroups too much. * * The current cgroup writeback is built on the assumption that multiple * cgroups writing to the same inode concurrently is very rare and a mode * of operation which isn't well supported. As such, the goal is not * taking too long when a different cgroup takes over an inode while * avoiding too aggressive flip-flops from occasional foreign writes. * * We record, very roughly, 2s worth of IO time history and if more than * half of that is foreign, trigger the switch. The recording is quantized * to 16 slots. To avoid tiny writes from swinging the decision too much, * writes smaller than 1/8 of avg size are ignored. 
*/ #define WB_FRN_TIME_SHIFT 13 /* 1s = 2^13, upto 8 secs w/ 16bit */ #define WB_FRN_TIME_AVG_SHIFT 3 /* avg = avg * 7/8 + new * 1/8 */ #define WB_FRN_TIME_CUT_DIV 8 /* ignore rounds < avg / 8 */ #define WB_FRN_TIME_PERIOD (2 * (1 << WB_FRN_TIME_SHIFT)) /* 2s */ #define WB_FRN_HIST_SLOTS 16 /* inode->i_wb_frn_history is 16bit */ #define WB_FRN_HIST_UNIT (WB_FRN_TIME_PERIOD / WB_FRN_HIST_SLOTS) /* each slot's duration is 2s / 16 */ #define WB_FRN_HIST_THR_SLOTS (WB_FRN_HIST_SLOTS / 2) /* if foreign slots >= 8, switch */ #define WB_FRN_HIST_MAX_SLOTS (WB_FRN_HIST_THR_SLOTS / 2 + 1) /* one round can affect upto 5 slots */ #define WB_FRN_MAX_IN_FLIGHT 1024 /* don't queue too many concurrently */ /* * Maximum inodes per isw. A specific value has been chosen to make * struct inode_switch_wbs_context fit into 1024 bytes kmalloc. */ #define WB_MAX_INODES_PER_ISW ((1024UL - sizeof(struct inode_switch_wbs_context)) \ / sizeof(struct inode *)) static atomic_t isw_nr_in_flight = ATOMIC_INIT(0); static struct workqueue_struct *isw_wq; void __inode_attach_wb(struct inode *inode, struct page *page) { struct backing_dev_info *bdi = inode_to_bdi(inode); struct bdi_writeback *wb = NULL; if (inode_cgwb_enabled(inode)) { struct cgroup_subsys_state *memcg_css; if (page) { memcg_css = mem_cgroup_css_from_page(page); wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC); } else { /* must pin memcg_css, see wb_get_create() */ memcg_css = task_get_css(current, memory_cgrp_id); wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC); css_put(memcg_css); } } if (!wb) wb = &bdi->wb; /* * There may be multiple instances of this function racing to * update the same inode. Use cmpxchg() to tell the winner. */ if (unlikely(cmpxchg(&inode->i_wb, NULL, wb))) wb_put(wb); } EXPORT_SYMBOL_GPL(__inode_attach_wb); /** * inode_cgwb_move_to_attached - put the inode onto wb->b_attached list * @inode: inode of interest with i_lock held * @wb: target bdi_writeback * * Remove the inode from wb's io lists and if necessarily put onto b_attached * list. Only inodes attached to cgwb's are kept on this list. */ static void inode_cgwb_move_to_attached(struct inode *inode, struct bdi_writeback *wb) { assert_spin_locked(&wb->list_lock); assert_spin_locked(&inode->i_lock); inode->i_state &= ~I_SYNC_QUEUED; if (wb != &wb->bdi->wb) list_move(&inode->i_io_list, &wb->b_attached); else list_del_init(&inode->i_io_list); wb_io_lists_depopulated(wb); } /** * locked_inode_to_wb_and_lock_list - determine a locked inode's wb and lock it * @inode: inode of interest with i_lock held * * Returns @inode's wb with its list_lock held. @inode->i_lock must be * held on entry and is released on return. The returned wb is guaranteed * to stay @inode's associated wb until its list_lock is released. */ static struct bdi_writeback * locked_inode_to_wb_and_lock_list(struct inode *inode) __releases(&inode->i_lock) __acquires(&wb->list_lock) { while (true) { struct bdi_writeback *wb = inode_to_wb(inode); /* * inode_to_wb() association is protected by both * @inode->i_lock and @wb->list_lock but list_lock nests * outside i_lock. Drop i_lock and verify that the * association hasn't changed after acquiring list_lock. 
*/ wb_get(wb); spin_unlock(&inode->i_lock); spin_lock(&wb->list_lock); /* i_wb may have changed inbetween, can't use inode_to_wb() */ if (likely(wb == inode->i_wb)) { wb_put(wb); /* @inode already has ref */ return wb; } spin_unlock(&wb->list_lock); wb_put(wb); cpu_relax(); spin_lock(&inode->i_lock); } } /** * inode_to_wb_and_lock_list - determine an inode's wb and lock it * @inode: inode of interest * * Same as locked_inode_to_wb_and_lock_list() but @inode->i_lock isn't held * on entry. */ static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode) __acquires(&wb->list_lock) { spin_lock(&inode->i_lock); return locked_inode_to_wb_and_lock_list(inode); } struct inode_switch_wbs_context { struct rcu_work work; /* * Multiple inodes can be switched at once. The switching procedure * consists of two parts, separated by a RCU grace period. To make * sure that the second part is executed for each inode gone through * the first part, all inode pointers are placed into a NULL-terminated * array embedded into struct inode_switch_wbs_context. Otherwise * an inode could be left in a non-consistent state. */ struct bdi_writeback *new_wb; struct inode *inodes[]; }; static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi) { down_write(&bdi->wb_switch_rwsem); } static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi) { up_write(&bdi->wb_switch_rwsem); } static bool inode_do_switch_wbs(struct inode *inode, struct bdi_writeback *old_wb, struct bdi_writeback *new_wb) { struct address_space *mapping = inode->i_mapping; XA_STATE(xas, &mapping->i_pages, 0); struct page *page; bool switched = false; spin_lock(&inode->i_lock); xa_lock_irq(&mapping->i_pages); /* * Once I_FREEING or I_WILL_FREE are visible under i_lock, the eviction * path owns the inode and we shouldn't modify ->i_io_list. */ if (unlikely(inode->i_state & (I_FREEING | I_WILL_FREE))) goto skip_switch; trace_inode_switch_wbs(inode, old_wb, new_wb); /* * Count and transfer stats. Note that PAGECACHE_TAG_DIRTY points * to possibly dirty pages while PAGECACHE_TAG_WRITEBACK points to * pages actually under writeback. */ xas_for_each_marked(&xas, page, ULONG_MAX, PAGECACHE_TAG_DIRTY) { if (PageDirty(page)) { dec_wb_stat(old_wb, WB_RECLAIMABLE); inc_wb_stat(new_wb, WB_RECLAIMABLE); } } xas_set(&xas, 0); xas_for_each_marked(&xas, page, ULONG_MAX, PAGECACHE_TAG_WRITEBACK) { WARN_ON_ONCE(!PageWriteback(page)); dec_wb_stat(old_wb, WB_WRITEBACK); inc_wb_stat(new_wb, WB_WRITEBACK); } if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) { atomic_dec(&old_wb->writeback_inodes); atomic_inc(&new_wb->writeback_inodes); } wb_get(new_wb); /* * Transfer to @new_wb's IO list if necessary. If the @inode is dirty, * the specific list @inode was on is ignored and the @inode is put on * ->b_dirty which is always correct including from ->b_dirty_time. * The transfer preserves @inode->dirtied_when ordering. If the @inode * was clean, it means it was on the b_attached list, so move it onto * the b_attached list of @new_wb. 
*/ if (!list_empty(&inode->i_io_list)) { inode->i_wb = new_wb; if (inode->i_state & I_DIRTY_ALL) { struct inode *pos; list_for_each_entry(pos, &new_wb->b_dirty, i_io_list) if (time_after_eq(inode->dirtied_when, pos->dirtied_when)) break; inode_io_list_move_locked(inode, new_wb, pos->i_io_list.prev); } else { inode_cgwb_move_to_attached(inode, new_wb); } } else { inode->i_wb = new_wb; } /* ->i_wb_frn updates may race wbc_detach_inode() but doesn't matter */ inode->i_wb_frn_winner = 0; inode->i_wb_frn_avg_time = 0; inode->i_wb_frn_history = 0; switched = true; skip_switch: /* * Paired with load_acquire in unlocked_inode_to_wb_begin() and * ensures that the new wb is visible if they see !I_WB_SWITCH. */ smp_store_release(&inode->i_state, inode->i_state & ~I_WB_SWITCH); xa_unlock_irq(&mapping->i_pages); spin_unlock(&inode->i_lock); return switched; } static void inode_switch_wbs_work_fn(struct work_struct *work) { struct inode_switch_wbs_context *isw = container_of(to_rcu_work(work), struct inode_switch_wbs_context, work); struct backing_dev_info *bdi = inode_to_bdi(isw->inodes[0]); struct bdi_writeback *old_wb = isw->inodes[0]->i_wb; struct bdi_writeback *new_wb = isw->new_wb; unsigned long nr_switched = 0; struct inode **inodep; /* * If @inode switches cgwb membership while sync_inodes_sb() is * being issued, sync_inodes_sb() might miss it. Synchronize. */ down_read(&bdi->wb_switch_rwsem); /* * By the time control reaches here, RCU grace period has passed * since I_WB_SWITCH assertion and all wb stat update transactions * between unlocked_inode_to_wb_begin/end() are guaranteed to be * synchronizing against the i_pages lock. * * Grabbing old_wb->list_lock, inode->i_lock and the i_pages lock * gives us exclusion against all wb related operations on @inode * including IO list manipulations and stat updates. */ if (old_wb < new_wb) { spin_lock(&old_wb->list_lock); spin_lock_nested(&new_wb->list_lock, SINGLE_DEPTH_NESTING); } else { spin_lock(&new_wb->list_lock); spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING); } for (inodep = isw->inodes; *inodep; inodep++) { WARN_ON_ONCE((*inodep)->i_wb != old_wb); if (inode_do_switch_wbs(*inodep, old_wb, new_wb)) nr_switched++; } spin_unlock(&new_wb->list_lock); spin_unlock(&old_wb->list_lock); up_read(&bdi->wb_switch_rwsem); if (nr_switched) { wb_wakeup(new_wb); wb_put_many(old_wb, nr_switched); } for (inodep = isw->inodes; *inodep; inodep++) iput(*inodep); wb_put(new_wb); kfree(isw); atomic_dec(&isw_nr_in_flight); } static bool inode_prepare_wbs_switch(struct inode *inode, struct bdi_writeback *new_wb) { /* * Paired with smp_mb() in cgroup_writeback_umount(). * isw_nr_in_flight must be increased before checking SB_ACTIVE and * grabbing an inode, otherwise isw_nr_in_flight can be observed as 0 * in cgroup_writeback_umount() and the isw_wq will be not flushed. */ smp_mb(); if (IS_DAX(inode)) return false; /* while holding I_WB_SWITCH, no one else can update the association */ spin_lock(&inode->i_lock); if (!(inode->i_sb->s_flags & SB_ACTIVE) || inode->i_state & (I_WB_SWITCH | I_FREEING | I_WILL_FREE) || inode_to_wb(inode) == new_wb) { spin_unlock(&inode->i_lock); return false; } inode->i_state |= I_WB_SWITCH; __iget(inode); spin_unlock(&inode->i_lock); return true; } /** * inode_switch_wbs - change the wb association of an inode * @inode: target inode * @new_wb_id: ID of the new wb * * Switch @inode's wb association to the wb identified by @new_wb_id. The * switching is performed asynchronously and may fail silently. 
*/ static void inode_switch_wbs(struct inode *inode, int new_wb_id) { struct backing_dev_info *bdi = inode_to_bdi(inode); struct cgroup_subsys_state *memcg_css; struct inode_switch_wbs_context *isw; /* noop if seems to be already in progress */ if (inode->i_state & I_WB_SWITCH) return; /* avoid queueing a new switch if too many are already in flight */ if (atomic_read(&isw_nr_in_flight) > WB_FRN_MAX_IN_FLIGHT) return; isw = kzalloc(sizeof(*isw) + 2 * sizeof(struct inode *), GFP_ATOMIC); if (!isw) return; atomic_inc(&isw_nr_in_flight); /* find and pin the new wb */ rcu_read_lock(); memcg_css = css_from_id(new_wb_id, &memory_cgrp_subsys); if (memcg_css && !css_tryget(memcg_css)) memcg_css = NULL; rcu_read_unlock(); if (!memcg_css) goto out_free; isw->new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC); css_put(memcg_css); if (!isw->new_wb) goto out_free; if (!inode_prepare_wbs_switch(inode, isw->new_wb)) goto out_free; isw->inodes[0] = inode; /* * In addition to synchronizing among switchers, I_WB_SWITCH tells * the RCU protected stat update paths to grab the i_page * lock so that stat transfer can synchronize against them. * Let's continue after I_WB_SWITCH is guaranteed to be visible. */ INIT_RCU_WORK(&isw->work, inode_switch_wbs_work_fn); queue_rcu_work(isw_wq, &isw->work); return; out_free: atomic_dec(&isw_nr_in_flight); if (isw->new_wb) wb_put(isw->new_wb); kfree(isw); } static bool isw_prepare_wbs_switch(struct inode_switch_wbs_context *isw, struct list_head *list, int *nr) { struct inode *inode; list_for_each_entry(inode, list, i_io_list) { if (!inode_prepare_wbs_switch(inode, isw->new_wb)) continue; isw->inodes[*nr] = inode; (*nr)++; if (*nr >= WB_MAX_INODES_PER_ISW - 1) return true; } return false; } /** * cleanup_offline_cgwb - detach associated inodes * @wb: target wb * * Switch all inodes attached to @wb to a nearest living ancestor's wb in order * to eventually release the dying @wb. Returns %true if not all inodes were * switched and the function has to be restarted. */ bool cleanup_offline_cgwb(struct bdi_writeback *wb) { struct cgroup_subsys_state *memcg_css; struct inode_switch_wbs_context *isw; int nr; bool restart = false; isw = kzalloc(sizeof(*isw) + WB_MAX_INODES_PER_ISW * sizeof(struct inode *), GFP_KERNEL); if (!isw) return restart; atomic_inc(&isw_nr_in_flight); for (memcg_css = wb->memcg_css->parent; memcg_css; memcg_css = memcg_css->parent) { isw->new_wb = wb_get_create(wb->bdi, memcg_css, GFP_KERNEL); if (isw->new_wb) break; } if (unlikely(!isw->new_wb)) isw->new_wb = &wb->bdi->wb; /* wb_get() is noop for bdi's wb */ nr = 0; spin_lock(&wb->list_lock); /* * In addition to the inodes that have completed writeback, also switch * cgwbs for those inodes only with dirty timestamps. Otherwise, those * inodes won't be written back for a long time when lazytime is * enabled, and thus pinning the dying cgwbs. It won't break the * bandwidth restrictions, as writeback of inode metadata is not * accounted for. */ restart = isw_prepare_wbs_switch(isw, &wb->b_attached, &nr); if (!restart) restart = isw_prepare_wbs_switch(isw, &wb->b_dirty_time, &nr); spin_unlock(&wb->list_lock); /* no attached inodes? bail out */ if (nr == 0) { atomic_dec(&isw_nr_in_flight); wb_put(isw->new_wb); kfree(isw); return restart; } /* * In addition to synchronizing among switchers, I_WB_SWITCH tells * the RCU protected stat update paths to grab the i_page * lock so that stat transfer can synchronize against them. * Let's continue after I_WB_SWITCH is guaranteed to be visible. 
*/ INIT_RCU_WORK(&isw->work, inode_switch_wbs_work_fn); queue_rcu_work(isw_wq, &isw->work); return restart; } /** * wbc_attach_and_unlock_inode - associate wbc with target inode and unlock it * @wbc: writeback_control of interest * @inode: target inode * * @inode is locked and about to be written back under the control of @wbc. * Record @inode's writeback context into @wbc and unlock the i_lock. On * writeback completion, wbc_detach_inode() should be called. This is used * to track the cgroup writeback context. */ void wbc_attach_and_unlock_inode(struct writeback_control *wbc, struct inode *inode) { if (!inode_cgwb_enabled(inode)) { spin_unlock(&inode->i_lock); return; } wbc->wb = inode_to_wb(inode); wbc->inode = inode; wbc->wb_id = wbc->wb->memcg_css->id; wbc->wb_lcand_id = inode->i_wb_frn_winner; wbc->wb_tcand_id = 0; wbc->wb_bytes = 0; wbc->wb_lcand_bytes = 0; wbc->wb_tcand_bytes = 0; wb_get(wbc->wb); spin_unlock(&inode->i_lock); /* * A dying wb indicates that either the blkcg associated with the * memcg changed or the associated memcg is dying. In the first * case, a replacement wb should already be available and we should * refresh the wb immediately. In the second case, trying to * refresh will keep failing. */ if (unlikely(wb_dying(wbc->wb) && !css_is_dying(wbc->wb->memcg_css))) inode_switch_wbs(inode, wbc->wb_id); } EXPORT_SYMBOL_GPL(wbc_attach_and_unlock_inode); /** * wbc_detach_inode - disassociate wbc from inode and perform foreign detection * @wbc: writeback_control of the just finished writeback * * To be called after a writeback attempt of an inode finishes and undoes * wbc_attach_and_unlock_inode(). Can be called under any context. * * As concurrent write sharing of an inode is expected to be very rare and * memcg only tracks page ownership on first-use basis severely confining * the usefulness of such sharing, cgroup writeback tracks ownership * per-inode. While the support for concurrent write sharing of an inode * is deemed unnecessary, an inode being written to by different cgroups at * different points in time is a lot more common, and, more importantly, * charging only by first-use can too readily lead to grossly incorrect * behaviors (single foreign page can lead to gigabytes of writeback to be * incorrectly attributed). * * To resolve this issue, cgroup writeback detects the majority dirtier of * an inode and transfers the ownership to it. To avoid unnnecessary * oscillation, the detection mechanism keeps track of history and gives * out the switch verdict only if the foreign usage pattern is stable over * a certain amount of time and/or writeback attempts. * * On each writeback attempt, @wbc tries to detect the majority writer * using Boyer-Moore majority vote algorithm. In addition to the byte * count from the majority voting, it also counts the bytes written for the * current wb and the last round's winner wb (max of last round's current * wb, the winner from two rounds ago, and the last round's majority * candidate). Keeping track of the historical winner helps the algorithm * to semi-reliably detect the most active writer even when it's not the * absolute majority. * * Once the winner of the round is determined, whether the winner is * foreign or not and how much IO time the round consumed is recorded in * inode->i_wb_frn_history. If the amount of recorded foreign IO time is * over a certain threshold, the switch verdict is given. 
*/ void wbc_detach_inode(struct writeback_control *wbc) { struct bdi_writeback *wb = wbc->wb; struct inode *inode = wbc->inode; unsigned long avg_time, max_bytes, max_time; u16 history; int max_id; if (!wb) return; history = inode->i_wb_frn_history; avg_time = inode->i_wb_frn_avg_time; /* pick the winner of this round */ if (wbc->wb_bytes >= wbc->wb_lcand_bytes && wbc->wb_bytes >= wbc->wb_tcand_bytes) { max_id = wbc->wb_id; max_bytes = wbc->wb_bytes; } else if (wbc->wb_lcand_bytes >= wbc->wb_tcand_bytes) { max_id = wbc->wb_lcand_id; max_bytes = wbc->wb_lcand_bytes; } else { max_id = wbc->wb_tcand_id; max_bytes = wbc->wb_tcand_bytes; } /* * Calculate the amount of IO time the winner consumed and fold it * into the running average kept per inode. If the consumed IO * time is lower than avag / WB_FRN_TIME_CUT_DIV, ignore it for * deciding whether to switch or not. This is to prevent one-off * small dirtiers from skewing the verdict. */ max_time = DIV_ROUND_UP((max_bytes >> PAGE_SHIFT) << WB_FRN_TIME_SHIFT, wb->avg_write_bandwidth); if (avg_time) avg_time += (max_time >> WB_FRN_TIME_AVG_SHIFT) - (avg_time >> WB_FRN_TIME_AVG_SHIFT); else avg_time = max_time; /* immediate catch up on first run */ if (max_time >= avg_time / WB_FRN_TIME_CUT_DIV) { int slots; /* * The switch verdict is reached if foreign wb's consume * more than a certain proportion of IO time in a * WB_FRN_TIME_PERIOD. This is loosely tracked by 16 slot * history mask where each bit represents one sixteenth of * the period. Determine the number of slots to shift into * history from @max_time. */ slots = min(DIV_ROUND_UP(max_time, WB_FRN_HIST_UNIT), (unsigned long)WB_FRN_HIST_MAX_SLOTS); history <<= slots; if (wbc->wb_id != max_id) history |= (1U << slots) - 1; if (history) trace_inode_foreign_history(inode, wbc, history); /* * Switch if the current wb isn't the consistent winner. * If there are multiple closely competing dirtiers, the * inode may switch across them repeatedly over time, which * is okay. The main goal is avoiding keeping an inode on * the wrong wb for an extended period of time. */ if (hweight16(history) > WB_FRN_HIST_THR_SLOTS) inode_switch_wbs(inode, max_id); } /* * Multiple instances of this function may race to update the * following fields but we don't mind occassional inaccuracies. */ inode->i_wb_frn_winner = max_id; inode->i_wb_frn_avg_time = min(avg_time, (unsigned long)U16_MAX); inode->i_wb_frn_history = history; wb_put(wbc->wb); wbc->wb = NULL; } EXPORT_SYMBOL_GPL(wbc_detach_inode); /** * wbc_account_cgroup_owner - account writeback to update inode cgroup ownership * @wbc: writeback_control of the writeback in progress * @page: page being written out * @bytes: number of bytes being written out * * @bytes from @page are about to written out during the writeback * controlled by @wbc. Keep the book for foreign inode detection. See * wbc_detach_inode(). */ void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page, size_t bytes) { struct cgroup_subsys_state *css; int id; /* * pageout() path doesn't attach @wbc to the inode being written * out. This is intentional as we don't want the function to block * behind a slow cgroup. Ultimately, we want pageout() to kick off * regular writeback instead of writing things out itself. 
*/ if (!wbc->wb || wbc->no_cgroup_owner) return; css = mem_cgroup_css_from_page(page); /* dead cgroups shouldn't contribute to inode ownership arbitration */ if (!(css->flags & CSS_ONLINE)) return; id = css->id; if (id == wbc->wb_id) { wbc->wb_bytes += bytes; return; } if (id == wbc->wb_lcand_id) wbc->wb_lcand_bytes += bytes; /* Boyer-Moore majority vote algorithm */ if (!wbc->wb_tcand_bytes) wbc->wb_tcand_id = id; if (id == wbc->wb_tcand_id) wbc->wb_tcand_bytes += bytes; else wbc->wb_tcand_bytes -= min(bytes, wbc->wb_tcand_bytes); } EXPORT_SYMBOL_GPL(wbc_account_cgroup_owner); /** * inode_congested - test whether an inode is congested * @inode: inode to test for congestion (may be NULL) * @cong_bits: mask of WB_[a]sync_congested bits to test * * Tests whether @inode is congested. @cong_bits is the mask of congestion * bits to test and the return value is the mask of set bits. * * If cgroup writeback is enabled for @inode, the congestion state is * determined by whether the cgwb (cgroup bdi_writeback) for the blkcg * associated with @inode is congested; otherwise, the root wb's congestion * state is used. * * @inode is allowed to be NULL as this function is often called on * mapping->host which is NULL for the swapper space. */ int inode_congested(struct inode *inode, int cong_bits) { /* * Once set, ->i_wb never becomes NULL while the inode is alive. * Start transaction iff ->i_wb is visible. */ if (inode && inode_to_wb_is_valid(inode)) { struct bdi_writeback *wb; struct wb_lock_cookie lock_cookie = {}; bool congested; wb = unlocked_inode_to_wb_begin(inode, &lock_cookie); congested = wb_congested(wb, cong_bits); unlocked_inode_to_wb_end(inode, &lock_cookie); return congested; } return wb_congested(&inode_to_bdi(inode)->wb, cong_bits); } EXPORT_SYMBOL_GPL(inode_congested); /** * wb_split_bdi_pages - split nr_pages to write according to bandwidth * @wb: target bdi_writeback to split @nr_pages to * @nr_pages: number of pages to write for the whole bdi * * Split @wb's portion of @nr_pages according to @wb's write bandwidth in * relation to the total write bandwidth of all wb's w/ dirty inodes on * @wb->bdi. */ static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages) { unsigned long this_bw = wb->avg_write_bandwidth; unsigned long tot_bw = atomic_long_read(&wb->bdi->tot_write_bandwidth); if (nr_pages == LONG_MAX) return LONG_MAX; /* * This may be called on clean wb's and proportional distribution * may not make sense, just use the original @nr_pages in those * cases. In general, we wanna err on the side of writing more. */ if (!tot_bw || this_bw >= tot_bw) return nr_pages; else return DIV_ROUND_UP_ULL((u64)nr_pages * this_bw, tot_bw); } /** * bdi_split_work_to_wbs - split a wb_writeback_work to all wb's of a bdi * @bdi: target backing_dev_info * @base_work: wb_writeback_work to issue * @skip_if_busy: skip wb's which already have writeback in progress * * Split and issue @base_work to all wb's (bdi_writeback's) of @bdi which * have dirty inodes. If @base_work->nr_page isn't %LONG_MAX, it's * distributed to the busy wbs according to each wb's proportion in the * total active write bandwidth of @bdi. 
*/ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi, struct wb_writeback_work *base_work, bool skip_if_busy) { struct bdi_writeback *last_wb = NULL; struct bdi_writeback *wb = list_entry(&bdi->wb_list, struct bdi_writeback, bdi_node); might_sleep(); restart: rcu_read_lock(); list_for_each_entry_continue_rcu(wb, &bdi->wb_list, bdi_node) { DEFINE_WB_COMPLETION(fallback_work_done, bdi); struct wb_writeback_work fallback_work; struct wb_writeback_work *work; long nr_pages; if (last_wb) { wb_put(last_wb); last_wb = NULL; } /* SYNC_ALL writes out I_DIRTY_TIME too */ if (!wb_has_dirty_io(wb) && (base_work->sync_mode == WB_SYNC_NONE || list_empty(&wb->b_dirty_time))) continue; if (skip_if_busy && writeback_in_progress(wb)) continue; nr_pages = wb_split_bdi_pages(wb, base_work->nr_pages); work = kmalloc(sizeof(*work), GFP_ATOMIC); if (work) { *work = *base_work; work->nr_pages = nr_pages; work->auto_free = 1; wb_queue_work(wb, work); continue; } /* * If wb_tryget fails, the wb has been shutdown, skip it. * * Pin @wb so that it stays on @bdi->wb_list. This allows * continuing iteration from @wb after dropping and * regrabbing rcu read lock. */ if (!wb_tryget(wb)) continue; /* alloc failed, execute synchronously using on-stack fallback */ work = &fallback_work; *work = *base_work; work->nr_pages = nr_pages; work->auto_free = 0; work->done = &fallback_work_done; wb_queue_work(wb, work); last_wb = wb; rcu_read_unlock(); wb_wait_for_completion(&fallback_work_done); goto restart; } rcu_read_unlock(); if (last_wb) wb_put(last_wb); } /** * cgroup_writeback_by_id - initiate cgroup writeback from bdi and memcg IDs * @bdi_id: target bdi id * @memcg_id: target memcg css id * @reason: reason why some writeback work initiated * @done: target wb_completion * * Initiate flush of the bdi_writeback identified by @bdi_id and @memcg_id * with the specified parameters. */ int cgroup_writeback_by_id(u64 bdi_id, int memcg_id, enum wb_reason reason, struct wb_completion *done) { struct backing_dev_info *bdi; struct cgroup_subsys_state *memcg_css; struct bdi_writeback *wb; struct wb_writeback_work *work; unsigned long dirty; int ret; /* lookup bdi and memcg */ bdi = bdi_get_by_id(bdi_id); if (!bdi) return -ENOENT; rcu_read_lock(); memcg_css = css_from_id(memcg_id, &memory_cgrp_subsys); if (memcg_css && !css_tryget(memcg_css)) memcg_css = NULL; rcu_read_unlock(); if (!memcg_css) { ret = -ENOENT; goto out_bdi_put; } /* * And find the associated wb. If the wb isn't there already * there's nothing to flush, don't create one. */ wb = wb_get_lookup(bdi, memcg_css); if (!wb) { ret = -ENOENT; goto out_css_put; } /* * The caller is attempting to write out most of * the currently dirty pages. Let's take the current dirty page * count and inflate it by 25% which should be large enough to * flush out most dirty pages while avoiding getting livelocked by * concurrent dirtiers. * * BTW the memcg stats are flushed periodically and this is best-effort * estimation, so some potential error is ok. 
*/ dirty = memcg_page_state(mem_cgroup_from_css(memcg_css), NR_FILE_DIRTY); dirty = dirty * 10 / 8; /* issue the writeback work */ work = kzalloc(sizeof(*work), GFP_NOWAIT | __GFP_NOWARN); if (work) { work->nr_pages = dirty; work->sync_mode = WB_SYNC_NONE; work->range_cyclic = 1; work->reason = reason; work->done = done; work->auto_free = 1; wb_queue_work(wb, work); ret = 0; } else { ret = -ENOMEM; } wb_put(wb); out_css_put: css_put(memcg_css); out_bdi_put: bdi_put(bdi); return ret; } /** * cgroup_writeback_umount - flush inode wb switches for umount * * This function is called when a super_block is about to be destroyed and * flushes in-flight inode wb switches. An inode wb switch goes through * RCU and then workqueue, so the two need to be flushed in order to ensure * that all previously scheduled switches are finished. As wb switches are * rare occurrences and synchronize_rcu() can take a while, perform * flushing iff wb switches are in flight. */ void cgroup_writeback_umount(void) { /* * SB_ACTIVE should be reliably cleared before checking * isw_nr_in_flight, see generic_shutdown_super(). */ smp_mb(); if (atomic_read(&isw_nr_in_flight)) { /* * Use rcu_barrier() to wait for all pending callbacks to * ensure that all in-flight wb switches are in the workqueue. */ rcu_barrier(); flush_workqueue(isw_wq); } } static int __init cgroup_writeback_init(void) { isw_wq = alloc_workqueue("inode_switch_wbs", 0, 0); if (!isw_wq) return -ENOMEM; return 0; } fs_initcall(cgroup_writeback_init); #else /* CONFIG_CGROUP_WRITEBACK */ static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi) { } static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi) { } static void inode_cgwb_move_to_attached(struct inode *inode, struct bdi_writeback *wb) { assert_spin_locked(&wb->list_lock); assert_spin_locked(&inode->i_lock); inode->i_state &= ~I_SYNC_QUEUED; list_del_init(&inode->i_io_list); wb_io_lists_depopulated(wb); } static struct bdi_writeback * locked_inode_to_wb_and_lock_list(struct inode *inode) __releases(&inode->i_lock) __acquires(&wb->list_lock) { struct bdi_writeback *wb = inode_to_wb(inode); spin_unlock(&inode->i_lock); spin_lock(&wb->list_lock); return wb; } static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode) __acquires(&wb->list_lock) { struct bdi_writeback *wb = inode_to_wb(inode); spin_lock(&wb->list_lock); return wb; } static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages) { return nr_pages; } static void bdi_split_work_to_wbs(struct backing_dev_info *bdi, struct wb_writeback_work *base_work, bool skip_if_busy) { might_sleep(); if (!skip_if_busy || !writeback_in_progress(&bdi->wb)) { base_work->auto_free = 0; wb_queue_work(&bdi->wb, base_work); } } #endif /* CONFIG_CGROUP_WRITEBACK */ /* * Add in the number of potentially dirty inodes, because each inode * write can dirty pagecache in the underlying blockdev. */ static unsigned long get_nr_dirty_pages(void) { return global_node_page_state(NR_FILE_DIRTY) + get_nr_dirty_inodes(); } static void wb_start_writeback(struct bdi_writeback *wb, enum wb_reason reason) { if (!wb_has_dirty_io(wb)) return; /* * All callers of this function want to start writeback of all * dirty pages. Places like vmscan can call this at a very * high frequency, causing pointless allocations of tons of * work items and keeping the flusher threads busy retrieving * that work. Ensure that we only allow one of them pending and * inflight at the time. 
*/ if (test_bit(WB_start_all, &wb->state) || test_and_set_bit(WB_start_all, &wb->state)) return; wb->start_all_reason = reason; wb_wakeup(wb); } /** * wb_start_background_writeback - start background writeback * @wb: bdi_writeback to write from * * Description: * This makes sure WB_SYNC_NONE background writeback happens. When * this function returns, it is only guaranteed that for given wb * some IO is happening if we are over background dirty threshold. * Caller need not hold sb s_umount semaphore. */ void wb_start_background_writeback(struct bdi_writeback *wb) { /* * We just wake up the flusher thread. It will perform background * writeback as soon as there is no other work to do. */ trace_writeback_wake_background(wb); wb_wakeup(wb); } /* * Remove the inode from the writeback list it is on. */ void inode_io_list_del(struct inode *inode) { struct bdi_writeback *wb; wb = inode_to_wb_and_lock_list(inode); spin_lock(&inode->i_lock); inode->i_state &= ~I_SYNC_QUEUED; list_del_init(&inode->i_io_list); wb_io_lists_depopulated(wb); spin_unlock(&inode->i_lock); spin_unlock(&wb->list_lock); } EXPORT_SYMBOL(inode_io_list_del); /* * mark an inode as under writeback on the sb */ void sb_mark_inode_writeback(struct inode *inode) { struct super_block *sb = inode->i_sb; unsigned long flags; if (list_empty(&inode->i_wb_list)) { spin_lock_irqsave(&sb->s_inode_wblist_lock, flags); if (list_empty(&inode->i_wb_list)) { list_add_tail(&inode->i_wb_list, &sb->s_inodes_wb); trace_sb_mark_inode_writeback(inode); } spin_unlock_irqrestore(&sb->s_inode_wblist_lock, flags); } } /* * clear an inode as under writeback on the sb */ void sb_clear_inode_writeback(struct inode *inode) { struct super_block *sb = inode->i_sb; unsigned long flags; if (!list_empty(&inode->i_wb_list)) { spin_lock_irqsave(&sb->s_inode_wblist_lock, flags); if (!list_empty(&inode->i_wb_list)) { list_del_init(&inode->i_wb_list); trace_sb_clear_inode_writeback(inode); } spin_unlock_irqrestore(&sb->s_inode_wblist_lock, flags); } } /* * Redirty an inode: set its when-it-was dirtied timestamp and move it to the * furthest end of its superblock's dirty-inode list. * * Before stamping the inode's ->dirtied_when, we check to see whether it is * already the most-recently-dirtied inode on the b_dirty list. If that is * the case then the inode must have been redirtied while it was being written * out and we don't reset its dirtied_when. */ static void redirty_tail_locked(struct inode *inode, struct bdi_writeback *wb) { assert_spin_locked(&inode->i_lock); if (!list_empty(&wb->b_dirty)) { struct inode *tail; tail = wb_inode(wb->b_dirty.next); if (time_before(inode->dirtied_when, tail->dirtied_when)) inode->dirtied_when = jiffies; } inode_io_list_move_locked(inode, wb, &wb->b_dirty); inode->i_state &= ~I_SYNC_QUEUED; } static void redirty_tail(struct inode *inode, struct bdi_writeback *wb) { spin_lock(&inode->i_lock); redirty_tail_locked(inode, wb); spin_unlock(&inode->i_lock); } /* * requeue inode for re-scanning after bdi->b_io list is exhausted. */ static void requeue_io(struct inode *inode, struct bdi_writeback *wb) { inode_io_list_move_locked(inode, wb, &wb->b_more_io); } static void inode_sync_complete(struct inode *inode) { inode->i_state &= ~I_SYNC; /* If inode is clean and unused, put it into LRU now... 
*/ inode_add_lru(inode); /* Waiters must see I_SYNC cleared before being woken up */ smp_mb(); wake_up_bit(&inode->i_state, __I_SYNC); } static bool inode_dirtied_after(struct inode *inode, unsigned long t) { bool ret = time_after(inode->dirtied_when, t); #ifndef CONFIG_64BIT /* * For inodes being constantly redirtied, dirtied_when can get stuck. * It _appears_ to be in the future, but is actually in distant past. * This test is necessary to prevent such wrapped-around relative times * from permanently stopping the whole bdi writeback. */ ret = ret && time_before_eq(inode->dirtied_when, jiffies); #endif return ret; } #define EXPIRE_DIRTY_ATIME 0x0001 /* * Move expired (dirtied before dirtied_before) dirty inodes from * @delaying_queue to @dispatch_queue. */ static int move_expired_inodes(struct list_head *delaying_queue, struct list_head *dispatch_queue, unsigned long dirtied_before) { LIST_HEAD(tmp); struct list_head *pos, *node; struct super_block *sb = NULL; struct inode *inode; int do_sb_sort = 0; int moved = 0; while (!list_empty(delaying_queue)) { inode = wb_inode(delaying_queue->prev); if (inode_dirtied_after(inode, dirtied_before)) break; spin_lock(&inode->i_lock); list_move(&inode->i_io_list, &tmp); moved++; inode->i_state |= I_SYNC_QUEUED; spin_unlock(&inode->i_lock); if (sb_is_blkdev_sb(inode->i_sb)) continue; if (sb && sb != inode->i_sb) do_sb_sort = 1; sb = inode->i_sb; } /* just one sb in list, splice to dispatch_queue and we're done */ if (!do_sb_sort) { list_splice(&tmp, dispatch_queue); goto out; } /* * Although inode's i_io_list is moved from 'tmp' to 'dispatch_queue', * we don't take inode->i_lock here because it is just a pointless overhead. * Inode is already marked as I_SYNC_QUEUED so writeback list handling is * fully under our control. */ while (!list_empty(&tmp)) { sb = wb_inode(tmp.prev)->i_sb; list_for_each_prev_safe(pos, node, &tmp) { inode = wb_inode(pos); if (inode->i_sb == sb) list_move(&inode->i_io_list, dispatch_queue); } } out: return moved; } /* * Queue all expired dirty inodes for io, eldest first. * Before * newly dirtied b_dirty b_io b_more_io * =============> gf edc BA * After * newly dirtied b_dirty b_io b_more_io * =============> g fBAedc * | * +--> dequeue for IO */ static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work, unsigned long dirtied_before) { int moved; unsigned long time_expire_jif = dirtied_before; assert_spin_locked(&wb->list_lock); list_splice_init(&wb->b_more_io, &wb->b_io); moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, dirtied_before); if (!work->for_sync) time_expire_jif = jiffies - dirtytime_expire_interval * HZ; moved += move_expired_inodes(&wb->b_dirty_time, &wb->b_io, time_expire_jif); if (moved) wb_io_lists_populated(wb); trace_writeback_queue_io(wb, work, dirtied_before, moved); } static int write_inode(struct inode *inode, struct writeback_control *wbc) { int ret; if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) { trace_writeback_write_inode_start(inode, wbc); ret = inode->i_sb->s_op->write_inode(inode, wbc); trace_writeback_write_inode(inode, wbc); return ret; } return 0; } /* * Wait for writeback on an inode to complete. Called with i_lock held. * Caller must make sure inode cannot go away when we drop i_lock. 
*/ static void __inode_wait_for_writeback(struct inode *inode) __releases(inode->i_lock) __acquires(inode->i_lock) { DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC); wait_queue_head_t *wqh; wqh = bit_waitqueue(&inode->i_state, __I_SYNC); while (inode->i_state & I_SYNC) { spin_unlock(&inode->i_lock); __wait_on_bit(wqh, &wq, bit_wait, TASK_UNINTERRUPTIBLE); spin_lock(&inode->i_lock); } } /* * Wait for writeback on an inode to complete. Caller must have inode pinned. */ void inode_wait_for_writeback(struct inode *inode) { spin_lock(&inode->i_lock); __inode_wait_for_writeback(inode); spin_unlock(&inode->i_lock); } /* * Sleep until I_SYNC is cleared. This function must be called with i_lock * held and drops it. It is aimed for callers not holding any inode reference * so once i_lock is dropped, inode can go away. */ static void inode_sleep_on_writeback(struct inode *inode) __releases(inode->i_lock) { DEFINE_WAIT(wait); wait_queue_head_t *wqh = bit_waitqueue(&inode->i_state, __I_SYNC); int sleep; prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); sleep = inode->i_state & I_SYNC; spin_unlock(&inode->i_lock); if (sleep) schedule(); finish_wait(wqh, &wait); } /* * Find proper writeback list for the inode depending on its current state and * possibly also change of its state while we were doing writeback. Here we * handle things such as livelock prevention or fairness of writeback among * inodes. This function can be called only by flusher thread - noone else * processes all inodes in writeback lists and requeueing inodes behind flusher * thread's back can have unexpected consequences. */ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb, struct writeback_control *wbc) { if (inode->i_state & I_FREEING) return; /* * Sync livelock prevention. Each inode is tagged and synced in one * shot. If still dirty, it will be redirty_tail()'ed below. Update * the dirty time to prevent enqueue and sync it again. */ if ((inode->i_state & I_DIRTY) && (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)) inode->dirtied_when = jiffies; if (wbc->pages_skipped) { /* * Writeback is not making progress due to locked buffers. * Skip this inode for now. Although having skipped pages * is odd for clean inodes, it can happen for some * filesystems so handle that gracefully. */ if (inode->i_state & I_DIRTY_ALL) redirty_tail_locked(inode, wb); else inode_cgwb_move_to_attached(inode, wb); return; } if (mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) { /* * We didn't write back all the pages. nfs_writepages() * sometimes bales out without doing anything. */ if (wbc->nr_to_write <= 0) { /* Slice used up. Queue for next turn. */ requeue_io(inode, wb); } else { /* * Writeback blocked by something other than * congestion. Delay the inode for some time to * avoid spinning on the CPU (100% iowait) * retrying writeback of the dirty page/inode * that cannot be performed immediately. */ redirty_tail_locked(inode, wb); } } else if (inode->i_state & I_DIRTY) { /* * Filesystems can dirty the inode during writeback operations, * such as delayed allocation during submission or metadata * updates after data IO completion. */ redirty_tail_locked(inode, wb); } else if (inode->i_state & I_DIRTY_TIME) { inode->dirtied_when = jiffies; inode_io_list_move_locked(inode, wb, &wb->b_dirty_time); inode->i_state &= ~I_SYNC_QUEUED; } else { /* The inode is clean. Remove from writeback lists. 
*/ inode_cgwb_move_to_attached(inode, wb); } } /* * Write out an inode and its dirty pages (or some of its dirty pages, depending * on @wbc->nr_to_write), and clear the relevant dirty flags from i_state. * * This doesn't remove the inode from the writeback list it is on, except * potentially to move it from b_dirty_time to b_dirty due to timestamp * expiration. The caller is otherwise responsible for writeback list handling. * * The caller is also responsible for setting the I_SYNC flag beforehand and * calling inode_sync_complete() to clear it afterwards. */ static int __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) { struct address_space *mapping = inode->i_mapping; long nr_to_write = wbc->nr_to_write; unsigned dirty; int ret; WARN_ON(!(inode->i_state & I_SYNC)); trace_writeback_single_inode_start(inode, wbc, nr_to_write); ret = do_writepages(mapping, wbc); /* * Make sure to wait on the data before writing out the metadata. * This is important for filesystems that modify metadata on data * I/O completion. We don't do it for sync(2) writeback because it has a * separate, external IO completion path and ->sync_fs for guaranteeing * inode metadata is written back correctly. */ if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) { int err = filemap_fdatawait(mapping); if (ret == 0) ret = err; } /* * If the inode has dirty timestamps and we need to write them, call * mark_inode_dirty_sync() to notify the filesystem about it and to * change I_DIRTY_TIME into I_DIRTY_SYNC. */ if ((inode->i_state & I_DIRTY_TIME) && (wbc->sync_mode == WB_SYNC_ALL || time_after(jiffies, inode->dirtied_time_when + dirtytime_expire_interval * HZ))) { trace_writeback_lazytime(inode); mark_inode_dirty_sync(inode); } /* * Get and clear the dirty flags from i_state. This needs to be done * after calling writepages because some filesystems may redirty the * inode during writepages due to delalloc. It also needs to be done * after handling timestamp expiration, as that may dirty the inode too. */ spin_lock(&inode->i_lock); dirty = inode->i_state & I_DIRTY; inode->i_state &= ~dirty; /* * Paired with smp_mb() in __mark_inode_dirty(). This allows * __mark_inode_dirty() to test i_state without grabbing i_lock - * either they see the I_DIRTY bits cleared or we see the dirtied * inode. * * I_DIRTY_PAGES is always cleared together above even if @mapping * still has dirty pages. The flag is reinstated after smp_mb() if * necessary. This guarantees that either __mark_inode_dirty() * sees clear I_DIRTY_PAGES or we see PAGECACHE_TAG_DIRTY. */ smp_mb(); if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) inode->i_state |= I_DIRTY_PAGES; spin_unlock(&inode->i_lock); /* Don't write the inode if only I_DIRTY_PAGES was set */ if (dirty & ~I_DIRTY_PAGES) { int err = write_inode(inode, wbc); if (ret == 0) ret = err; } trace_writeback_single_inode(inode, wbc, nr_to_write); return ret; } /* * Write out an inode's dirty data and metadata on-demand, i.e. separately from * the regular batched writeback done by the flusher threads in * writeback_sb_inodes(). @wbc controls various aspects of the write, such as * whether it is a data-integrity sync (%WB_SYNC_ALL) or not (%WB_SYNC_NONE). * * To prevent the inode from going away, either the caller must have a reference * to the inode, or the inode must have I_WILL_FREE or I_FREEING set. 
*/ static int writeback_single_inode(struct inode *inode, struct writeback_control *wbc) { struct bdi_writeback *wb; int ret = 0; spin_lock(&inode->i_lock); if (!atomic_read(&inode->i_count)) WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING))); else WARN_ON(inode->i_state & I_WILL_FREE); if (inode->i_state & I_SYNC) { /* * Writeback is already running on the inode. For WB_SYNC_NONE, * that's enough and we can just return. For WB_SYNC_ALL, we * must wait for the existing writeback to complete, then do * writeback again if there's anything left. */ if (wbc->sync_mode != WB_SYNC_ALL) goto out; __inode_wait_for_writeback(inode); } WARN_ON(inode->i_state & I_SYNC); /* * If the inode is already fully clean, then there's nothing to do. * * For data-integrity syncs we also need to check whether any pages are * still under writeback, e.g. due to prior WB_SYNC_NONE writeback. If * there are any such pages, we'll need to wait for them. */ if (!(inode->i_state & I_DIRTY_ALL) && (wbc->sync_mode != WB_SYNC_ALL || !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK))) goto out; inode->i_state |= I_SYNC; wbc_attach_and_unlock_inode(wbc, inode); ret = __writeback_single_inode(inode, wbc); wbc_detach_inode(wbc); wb = inode_to_wb_and_lock_list(inode); spin_lock(&inode->i_lock); /* * If the inode is freeing, its i_io_list shoudn't be updated * as it can be finally deleted at this moment. */ if (!(inode->i_state & I_FREEING)) { /* * If the inode is now fully clean, then it can be safely * removed from its writeback list (if any). Otherwise the * flusher threads are responsible for the writeback lists. */ if (!(inode->i_state & I_DIRTY_ALL)) inode_cgwb_move_to_attached(inode, wb); else if (!(inode->i_state & I_SYNC_QUEUED)) { if ((inode->i_state & I_DIRTY)) redirty_tail_locked(inode, wb); else if (inode->i_state & I_DIRTY_TIME) { inode->dirtied_when = jiffies; inode_io_list_move_locked(inode, wb, &wb->b_dirty_time); } } } spin_unlock(&wb->list_lock); inode_sync_complete(inode); out: spin_unlock(&inode->i_lock); return ret; } static long writeback_chunk_size(struct bdi_writeback *wb, struct wb_writeback_work *work) { long pages; /* * WB_SYNC_ALL mode does livelock avoidance by syncing dirty * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX * here avoids calling into writeback_inodes_wb() more than once. * * The intended call sequence for WB_SYNC_ALL writeback is: * * wb_writeback() * writeback_sb_inodes() <== called only once * write_cache_pages() <== called once for each inode * (quickly) tag currently dirty pages * (maybe slowly) sync all tagged pages */ if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages) pages = LONG_MAX; else { pages = min(wb->avg_write_bandwidth / 2, global_wb_domain.dirty_limit / DIRTY_SCOPE); pages = min(pages, work->nr_pages); pages = round_down(pages + MIN_WRITEBACK_PAGES, MIN_WRITEBACK_PAGES); } return pages; } /* * Write a portion of b_io inodes which belong to @sb. * * Return the number of pages and/or inodes written. * * NOTE! This is called with wb->list_lock held, and will * unlock and relock that for each inode it ends up doing * IO for. 
*/ static long writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb, struct wb_writeback_work *work) { struct writeback_control wbc = { .sync_mode = work->sync_mode, .tagged_writepages = work->tagged_writepages, .for_kupdate = work->for_kupdate, .for_background = work->for_background, .for_sync = work->for_sync, .range_cyclic = work->range_cyclic, .range_start = 0, .range_end = LLONG_MAX, }; unsigned long start_time = jiffies; long write_chunk; long total_wrote = 0; /* count both pages and inodes */ while (!list_empty(&wb->b_io)) { struct inode *inode = wb_inode(wb->b_io.prev); struct bdi_writeback *tmp_wb; long wrote; if (inode->i_sb != sb) { if (work->sb) { /* * We only want to write back data for this * superblock, move all inodes not belonging * to it back onto the dirty list. */ redirty_tail(inode, wb); continue; } /* * The inode belongs to a different superblock. * Bounce back to the caller to unpin this and * pin the next superblock. */ break; } /* * Don't bother with new inodes or inodes being freed, first * kind does not need periodic writeout yet, and for the latter * kind writeout is handled by the freer. */ spin_lock(&inode->i_lock); if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { redirty_tail_locked(inode, wb); spin_unlock(&inode->i_lock); continue; } if ((inode->i_state & I_SYNC) && wbc.sync_mode != WB_SYNC_ALL) { /* * If this inode is locked for writeback and we are not * doing writeback-for-data-integrity, move it to * b_more_io so that writeback can proceed with the * other inodes on s_io. * * We'll have another go at writing back this inode * when we completed a full scan of b_io. */ requeue_io(inode, wb); spin_unlock(&inode->i_lock); trace_writeback_sb_inodes_requeue(inode); continue; } spin_unlock(&wb->list_lock); /* * We already requeued the inode if it had I_SYNC set and we * are doing WB_SYNC_NONE writeback. So this catches only the * WB_SYNC_ALL case. */ if (inode->i_state & I_SYNC) { /* Wait for I_SYNC. This function drops i_lock... */ inode_sleep_on_writeback(inode); /* Inode may be gone, start again */ spin_lock(&wb->list_lock); continue; } inode->i_state |= I_SYNC; wbc_attach_and_unlock_inode(&wbc, inode); write_chunk = writeback_chunk_size(wb, work); wbc.nr_to_write = write_chunk; wbc.pages_skipped = 0; /* * We use I_SYNC to pin the inode in memory. While it is set * evict_inode() will wait so the inode cannot be freed. */ __writeback_single_inode(inode, &wbc); wbc_detach_inode(&wbc); work->nr_pages -= write_chunk - wbc.nr_to_write; wrote = write_chunk - wbc.nr_to_write - wbc.pages_skipped; wrote = wrote < 0 ? 0 : wrote; total_wrote += wrote; if (need_resched()) { /* * We're trying to balance between building up a nice * long list of IOs to improve our merge rate, and * getting those IOs out quickly for anyone throttling * in balance_dirty_pages(). cond_resched() doesn't * unplug, so get our IOs out the door before we * give up the CPU. */ blk_flush_plug(current); cond_resched(); } /* * Requeue @inode if still dirty. Be careful as @inode may * have been switched to another wb in the meantime. */ tmp_wb = inode_to_wb_and_lock_list(inode); spin_lock(&inode->i_lock); if (!(inode->i_state & I_DIRTY_ALL)) total_wrote++; requeue_inode(inode, tmp_wb, &wbc); inode_sync_complete(inode); spin_unlock(&inode->i_lock); if (unlikely(tmp_wb != wb)) { spin_unlock(&tmp_wb->list_lock); spin_lock(&wb->list_lock); } /* * bail out to wb_writeback() often enough to check * background threshold and other termination conditions. 
*/ if (total_wrote) { if (time_is_before_jiffies(start_time + HZ / 10UL)) break; if (work->nr_pages <= 0) break; } } return total_wrote; } static long __writeback_inodes_wb(struct bdi_writeback *wb, struct wb_writeback_work *work) { unsigned long start_time = jiffies; long wrote = 0; while (!list_empty(&wb->b_io)) { struct inode *inode = wb_inode(wb->b_io.prev); struct super_block *sb = inode->i_sb; if (!trylock_super(sb)) { /* * trylock_super() may fail consistently due to * s_umount being grabbed by someone else. Don't use * requeue_io() to avoid busy retrying the inode/sb. */ redirty_tail(inode, wb); continue; } wrote += writeback_sb_inodes(sb, wb, work); up_read(&sb->s_umount); /* refer to the same tests at the end of writeback_sb_inodes */ if (wrote) { if (time_is_before_jiffies(start_time + HZ / 10UL)) break; if (work->nr_pages <= 0) break; } } /* Leave any unwritten inodes on b_io */ return wrote; } static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages, enum wb_reason reason) { struct wb_writeback_work work = { .nr_pages = nr_pages, .sync_mode = WB_SYNC_NONE, .range_cyclic = 1, .reason = reason, }; struct blk_plug plug; blk_start_plug(&plug); spin_lock(&wb->list_lock); if (list_empty(&wb->b_io)) queue_io(wb, &work, jiffies); __writeback_inodes_wb(wb, &work); spin_unlock(&wb->list_lock); blk_finish_plug(&plug); return nr_pages - work.nr_pages; } /* * Explicit flushing or periodic writeback of "old" data. * * Define "old": the first time one of an inode's pages is dirtied, we mark the * dirtying-time in the inode's address_space. So this periodic writeback code * just walks the superblock inode list, writing back any inodes which are * older than a specific point in time. * * Try to run once per dirty_writeback_interval. But if a writeback event * takes longer than a dirty_writeback_interval interval, then leave a * one-second gap. * * dirtied_before takes precedence over nr_to_write. So we'll only write back * all dirty pages if they are all attached to "old" mappings. */ static long wb_writeback(struct bdi_writeback *wb, struct wb_writeback_work *work) { long nr_pages = work->nr_pages; unsigned long dirtied_before = jiffies; struct inode *inode; long progress; struct blk_plug plug; blk_start_plug(&plug); spin_lock(&wb->list_lock); for (;;) { /* * Stop writeback when nr_pages has been consumed */ if (work->nr_pages <= 0) break; /* * Background writeout and kupdate-style writeback may * run forever. Stop them if there is other work to do * so that e.g. sync can proceed. They'll be restarted * after the other works are all done. */ if ((work->for_background || work->for_kupdate) && !list_empty(&wb->work_list)) break; /* * For background writeout, stop when we are below the * background dirty threshold */ if (work->for_background && !wb_over_bg_thresh(wb)) break; /* * Kupdate and background works are special and we want to * include all inodes that need writing. Livelock avoidance is * handled by these works yielding to any other work so we are * safe. */ if (work->for_kupdate) { dirtied_before = jiffies - msecs_to_jiffies(dirty_expire_interval * 10); } else if (work->for_background) dirtied_before = jiffies; trace_writeback_start(wb, work); if (list_empty(&wb->b_io)) queue_io(wb, work, dirtied_before); if (work->sb) progress = writeback_sb_inodes(work->sb, wb, work); else progress = __writeback_inodes_wb(wb, work); trace_writeback_written(wb, work); /* * Did we write something? Try for more * * Dirty inodes are moved to b_io for writeback in batches. 
* The completion of the current batch does not necessarily * mean the overall work is done. So we keep looping as long * as made some progress on cleaning pages or inodes. */ if (progress) continue; /* * No more inodes for IO, bail */ if (list_empty(&wb->b_more_io)) break; /* * Nothing written. Wait for some inode to * become available for writeback. Otherwise * we'll just busyloop. */ trace_writeback_wait(wb, work); inode = wb_inode(wb->b_more_io.prev); spin_lock(&inode->i_lock); spin_unlock(&wb->list_lock); /* This function drops i_lock... */ inode_sleep_on_writeback(inode); spin_lock(&wb->list_lock); } spin_unlock(&wb->list_lock); blk_finish_plug(&plug); return nr_pages - work->nr_pages; } /* * Return the next wb_writeback_work struct that hasn't been processed yet. */ static struct wb_writeback_work *get_next_work_item(struct bdi_writeback *wb) { struct wb_writeback_work *work = NULL; spin_lock_irq(&wb->work_lock); if (!list_empty(&wb->work_list)) { work = list_entry(wb->work_list.next, struct wb_writeback_work, list); list_del_init(&work->list); } spin_unlock_irq(&wb->work_lock); return work; } static long wb_check_background_flush(struct bdi_writeback *wb) { if (wb_over_bg_thresh(wb)) { struct wb_writeback_work work = { .nr_pages = LONG_MAX, .sync_mode = WB_SYNC_NONE, .for_background = 1, .range_cyclic = 1, .reason = WB_REASON_BACKGROUND, }; return wb_writeback(wb, &work); } return 0; } static long wb_check_old_data_flush(struct bdi_writeback *wb) { unsigned long expired; long nr_pages; /* * When set to zero, disable periodic writeback */ if (!dirty_writeback_interval) return 0; expired = wb->last_old_flush + msecs_to_jiffies(dirty_writeback_interval * 10); if (time_before(jiffies, expired)) return 0; wb->last_old_flush = jiffies; nr_pages = get_nr_dirty_pages(); if (nr_pages) { struct wb_writeback_work work = { .nr_pages = nr_pages, .sync_mode = WB_SYNC_NONE, .for_kupdate = 1, .range_cyclic = 1, .reason = WB_REASON_PERIODIC, }; return wb_writeback(wb, &work); } return 0; } static long wb_check_start_all(struct bdi_writeback *wb) { long nr_pages; if (!test_bit(WB_start_all, &wb->state)) return 0; nr_pages = get_nr_dirty_pages(); if (nr_pages) { struct wb_writeback_work work = { .nr_pages = wb_split_bdi_pages(wb, nr_pages), .sync_mode = WB_SYNC_NONE, .range_cyclic = 1, .reason = wb->start_all_reason, }; nr_pages = wb_writeback(wb, &work); } clear_bit(WB_start_all, &wb->state); return nr_pages; } /* * Retrieve work items and do the writeback they describe */ static long wb_do_writeback(struct bdi_writeback *wb) { struct wb_writeback_work *work; long wrote = 0; set_bit(WB_writeback_running, &wb->state); while ((work = get_next_work_item(wb)) != NULL) { trace_writeback_exec(wb, work); wrote += wb_writeback(wb, work); finish_writeback_work(wb, work); } /* * Check for a flush-everything request */ wrote += wb_check_start_all(wb); /* * Check for periodic writeback, kupdated() style */ wrote += wb_check_old_data_flush(wb); wrote += wb_check_background_flush(wb); clear_bit(WB_writeback_running, &wb->state); return wrote; } /* * Handle writeback of dirty data for the device backed by this bdi. Also * reschedules periodically and does kupdated style flushing. 
*/ void wb_workfn(struct work_struct *work) { struct bdi_writeback *wb = container_of(to_delayed_work(work), struct bdi_writeback, dwork); long pages_written; set_worker_desc("flush-%s", bdi_dev_name(wb->bdi)); current->flags |= PF_SWAPWRITE; if (likely(!current_is_workqueue_rescuer() || !test_bit(WB_registered, &wb->state))) { /* * The normal path. Keep writing back @wb until its * work_list is empty. Note that this path is also taken * if @wb is shutting down even when we're running off the * rescuer as work_list needs to be drained. */ do { pages_written = wb_do_writeback(wb); trace_writeback_pages_written(pages_written); } while (!list_empty(&wb->work_list)); } else { /* * bdi_wq can't get enough workers and we're running off * the emergency worker. Don't hog it. Hopefully, 1024 is * enough for efficient IO. */ pages_written = writeback_inodes_wb(wb, 1024, WB_REASON_FORKER_THREAD); trace_writeback_pages_written(pages_written); } if (!list_empty(&wb->work_list)) wb_wakeup(wb); else if (wb_has_dirty_io(wb) && dirty_writeback_interval) wb_wakeup_delayed(wb); current->flags &= ~PF_SWAPWRITE; } /* * Start writeback of `nr_pages' pages on this bdi. If `nr_pages' is zero, * write back the whole world. */ static void __wakeup_flusher_threads_bdi(struct backing_dev_info *bdi, enum wb_reason reason) { struct bdi_writeback *wb; if (!bdi_has_dirty_io(bdi)) return; list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node) wb_start_writeback(wb, reason); } void wakeup_flusher_threads_bdi(struct backing_dev_info *bdi, enum wb_reason reason) { rcu_read_lock(); __wakeup_flusher_threads_bdi(bdi, reason); rcu_read_unlock(); } /* * Wakeup the flusher threads to start writeback of all currently dirty pages */ void wakeup_flusher_threads(enum wb_reason reason) { struct backing_dev_info *bdi; /* * If we are expecting writeback progress we must submit plugged IO. */ if (blk_needs_flush_plug(current)) blk_schedule_flush_plug(current); rcu_read_lock(); list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) __wakeup_flusher_threads_bdi(bdi, reason); rcu_read_unlock(); } /* * Wake up bdi's periodically to make sure dirtytime inodes gets * written back periodically. We deliberately do *not* check the * b_dirtytime list in wb_has_dirty_io(), since this would cause the * kernel to be constantly waking up once there are any dirtytime * inodes on the system. So instead we define a separate delayed work * function which gets called much more rarely. (By default, only * once every 12 hours.) * * If there is any other write activity going on in the file system, * this function won't be necessary. But if the only thing that has * happened on the file system is a dirtytime inode caused by an atime * update, we need this infrastructure below to make sure that inode * eventually gets pushed out to disk. 
*/ static void wakeup_dirtytime_writeback(struct work_struct *w); static DECLARE_DELAYED_WORK(dirtytime_work, wakeup_dirtytime_writeback); static void wakeup_dirtytime_writeback(struct work_struct *w) { struct backing_dev_info *bdi; rcu_read_lock(); list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) { struct bdi_writeback *wb; list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node) if (!list_empty(&wb->b_dirty_time)) wb_wakeup(wb); } rcu_read_unlock(); schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ); } static int __init start_dirtytime_writeback(void) { schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ); return 0; } __initcall(start_dirtytime_writeback); int dirtytime_interval_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret; ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (ret == 0 && write) mod_delayed_work(system_wq, &dirtytime_work, 0); return ret; } /** * __mark_inode_dirty - internal function to mark an inode dirty * * @inode: inode to mark * @flags: what kind of dirty, e.g. I_DIRTY_SYNC. This can be a combination of * multiple I_DIRTY_* flags, except that I_DIRTY_TIME can't be combined * with I_DIRTY_PAGES. * * Mark an inode as dirty. We notify the filesystem, then update the inode's * dirty flags. Then, if needed we add the inode to the appropriate dirty list. * * Most callers should use mark_inode_dirty() or mark_inode_dirty_sync() * instead of calling this directly. * * CAREFUL! We only add the inode to the dirty list if it is hashed or if it * refers to a blockdev. Unhashed inodes will never be added to the dirty list * even if they are later hashed, as they will have been marked dirty already. * * In short, ensure you hash any inodes _before_ you start marking them dirty. * * Note that for blockdevs, inode->dirtied_when represents the dirtying time of * the block-special inode (/dev/hda1) itself. And the ->dirtied_when field of * the kernel-internal blockdev inode represents the dirtying time of the * blockdev's pages. This is why for I_DIRTY_PAGES we always use * page->mapping->host, so the page-dirtying time is recorded in the internal * blockdev inode. */ void __mark_inode_dirty(struct inode *inode, int flags) { struct super_block *sb = inode->i_sb; int dirtytime = 0; struct bdi_writeback *wb = NULL; trace_writeback_mark_inode_dirty(inode, flags); if (flags & I_DIRTY_INODE) { /* * Inode timestamp update will piggback on this dirtying. * We tell ->dirty_inode callback that timestamps need to * be updated by setting I_DIRTY_TIME in flags. */ if (inode->i_state & I_DIRTY_TIME) { spin_lock(&inode->i_lock); if (inode->i_state & I_DIRTY_TIME) { inode->i_state &= ~I_DIRTY_TIME; flags |= I_DIRTY_TIME; } spin_unlock(&inode->i_lock); } /* * Notify the filesystem about the inode being dirtied, so that * (if needed) it can update on-disk fields and journal the * inode. This is only needed when the inode itself is being * dirtied now. I.e. it's only needed for I_DIRTY_INODE, not * for just I_DIRTY_PAGES or I_DIRTY_TIME. */ trace_writeback_dirty_inode_start(inode, flags); if (sb->s_op->dirty_inode) sb->s_op->dirty_inode(inode, flags & (I_DIRTY_INODE | I_DIRTY_TIME)); trace_writeback_dirty_inode(inode, flags); /* I_DIRTY_INODE supersedes I_DIRTY_TIME. */ flags &= ~I_DIRTY_TIME; } else { /* * Else it's either I_DIRTY_PAGES, I_DIRTY_TIME, or nothing. * (We don't support setting both I_DIRTY_PAGES and I_DIRTY_TIME * in one call to __mark_inode_dirty().) 
*/ dirtytime = flags & I_DIRTY_TIME; WARN_ON_ONCE(dirtytime && flags != I_DIRTY_TIME); } /* * Paired with smp_mb() in __writeback_single_inode() for the * following lockless i_state test. See there for details. */ smp_mb(); if ((inode->i_state & flags) == flags) return; spin_lock(&inode->i_lock); if ((inode->i_state & flags) != flags) { const int was_dirty = inode->i_state & I_DIRTY; inode_attach_wb(inode, NULL); inode->i_state |= flags; /* * Grab inode's wb early because it requires dropping i_lock and we * need to make sure following checks happen atomically with dirty * list handling so that we don't move inodes under flush worker's * hands. */ if (!was_dirty) { wb = locked_inode_to_wb_and_lock_list(inode); spin_lock(&inode->i_lock); } /* * If the inode is queued for writeback by flush worker, just * update its dirty state. Once the flush worker is done with * the inode it will place it on the appropriate superblock * list, based upon its state. */ if (inode->i_state & I_SYNC_QUEUED) goto out_unlock; /* * Only add valid (hashed) inodes to the superblock's * dirty list. Add blockdev inodes as well. */ if (!S_ISBLK(inode->i_mode)) { if (inode_unhashed(inode)) goto out_unlock; } if (inode->i_state & I_FREEING) goto out_unlock; /* * If the inode was already on b_dirty/b_io/b_more_io, don't * reposition it (that would break b_dirty time-ordering). */ if (!was_dirty) { struct list_head *dirty_list; bool wakeup_bdi = false; inode->dirtied_when = jiffies; if (dirtytime) inode->dirtied_time_when = jiffies; if (inode->i_state & I_DIRTY) dirty_list = &wb->b_dirty; else dirty_list = &wb->b_dirty_time; wakeup_bdi = inode_io_list_move_locked(inode, wb, dirty_list); spin_unlock(&wb->list_lock); spin_unlock(&inode->i_lock); trace_writeback_dirty_inode_enqueue(inode); /* * If this is the first dirty inode for this bdi, * we have to wake-up the corresponding bdi thread * to make sure background write-back happens * later. */ if (wakeup_bdi && (wb->bdi->capabilities & BDI_CAP_WRITEBACK)) wb_wakeup_delayed(wb); return; } } out_unlock: if (wb) spin_unlock(&wb->list_lock); spin_unlock(&inode->i_lock); } EXPORT_SYMBOL(__mark_inode_dirty); /* * The @s_sync_lock is used to serialise concurrent sync operations * to avoid lock contention problems with concurrent wait_sb_inodes() calls. * Concurrent callers will block on the s_sync_lock rather than doing contending * walks. The queueing maintains sync(2) required behaviour as all the IO that * has been issued up to the time this function is enter is guaranteed to be * completed by the time we have gained the lock and waited for all IO that is * in progress regardless of the order callers are granted the lock. */ static void wait_sb_inodes(struct super_block *sb) { LIST_HEAD(sync_list); /* * We need to be protected against the filesystem going from * r/o to r/w or vice versa. */ WARN_ON(!rwsem_is_locked(&sb->s_umount)); mutex_lock(&sb->s_sync_lock); /* * Splice the writeback list onto a temporary list to avoid waiting on * inodes that have started writeback after this point. * * Use rcu_read_lock() to keep the inodes around until we have a * reference. s_inode_wblist_lock protects sb->s_inodes_wb as well as * the local list because inodes can be dropped from either by writeback * completion. */ rcu_read_lock(); spin_lock_irq(&sb->s_inode_wblist_lock); list_splice_init(&sb->s_inodes_wb, &sync_list); /* * Data integrity sync. 
Must wait for all pages under writeback, because * there may have been pages dirtied before our sync call, but which had * writeout started before we write it out. In which case, the inode * may not be on the dirty list, but we still have to wait for that * writeout. */ while (!list_empty(&sync_list)) { struct inode *inode = list_first_entry(&sync_list, struct inode, i_wb_list); struct address_space *mapping = inode->i_mapping; /* * Move each inode back to the wb list before we drop the lock * to preserve consistency between i_wb_list and the mapping * writeback tag. Writeback completion is responsible to remove * the inode from either list once the writeback tag is cleared. */ list_move_tail(&inode->i_wb_list, &sb->s_inodes_wb); /* * The mapping can appear untagged while still on-list since we * do not have the mapping lock. Skip it here, wb completion * will remove it. */ if (!mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) continue; spin_unlock_irq(&sb->s_inode_wblist_lock); spin_lock(&inode->i_lock); if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) { spin_unlock(&inode->i_lock); spin_lock_irq(&sb->s_inode_wblist_lock); continue; } __iget(inode); spin_unlock(&inode->i_lock); rcu_read_unlock(); /* * We keep the error status of individual mapping so that * applications can catch the writeback error using fsync(2). * See filemap_fdatawait_keep_errors() for details. */ filemap_fdatawait_keep_errors(mapping); cond_resched(); iput(inode); rcu_read_lock(); spin_lock_irq(&sb->s_inode_wblist_lock); } spin_unlock_irq(&sb->s_inode_wblist_lock); rcu_read_unlock(); mutex_unlock(&sb->s_sync_lock); } static void __writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr, enum wb_reason reason, bool skip_if_busy) { struct backing_dev_info *bdi = sb->s_bdi; DEFINE_WB_COMPLETION(done, bdi); struct wb_writeback_work work = { .sb = sb, .sync_mode = WB_SYNC_NONE, .tagged_writepages = 1, .done = &done, .nr_pages = nr, .reason = reason, }; if (!bdi_has_dirty_io(bdi) || bdi == &noop_backing_dev_info) return; WARN_ON(!rwsem_is_locked(&sb->s_umount)); bdi_split_work_to_wbs(sb->s_bdi, &work, skip_if_busy); wb_wait_for_completion(&done); } /** * writeback_inodes_sb_nr - writeback dirty inodes from given super_block * @sb: the superblock * @nr: the number of pages to write * @reason: reason why some writeback work initiated * * Start writeback on some inodes on this super_block. No guarantees are made * on how many (if any) will be written, and this function does not wait * for IO completion of submitted IO. */ void writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr, enum wb_reason reason) { __writeback_inodes_sb_nr(sb, nr, reason, false); } EXPORT_SYMBOL(writeback_inodes_sb_nr); /** * writeback_inodes_sb - writeback dirty inodes from given super_block * @sb: the superblock * @reason: reason why some writeback work was initiated * * Start writeback on some inodes on this super_block. No guarantees are made * on how many (if any) will be written, and this function does not wait * for IO completion of submitted IO. */ void writeback_inodes_sb(struct super_block *sb, enum wb_reason reason) { return writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason); } EXPORT_SYMBOL(writeback_inodes_sb); /** * try_to_writeback_inodes_sb - try to start writeback if none underway * @sb: the superblock * @reason: reason why some writeback work was initiated * * Invoke __writeback_inodes_sb_nr if no writeback is currently underway. 
*/ void try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason) { if (!down_read_trylock(&sb->s_umount)) return; __writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason, true); up_read(&sb->s_umount); } EXPORT_SYMBOL(try_to_writeback_inodes_sb); /** * sync_inodes_sb - sync sb inode pages * @sb: the superblock * * This function writes and waits on any dirty inode belonging to this * super_block. */ void sync_inodes_sb(struct super_block *sb) { struct backing_dev_info *bdi = sb->s_bdi; DEFINE_WB_COMPLETION(done, bdi); struct wb_writeback_work work = { .sb = sb, .sync_mode = WB_SYNC_ALL, .nr_pages = LONG_MAX, .range_cyclic = 0, .done = &done, .reason = WB_REASON_SYNC, .for_sync = 1, }; /* * Can't skip on !bdi_has_dirty() because we should wait for !dirty * inodes under writeback and I_DIRTY_TIME inodes ignored by * bdi_has_dirty() need to be written out too. */ if (bdi == &noop_backing_dev_info) return; WARN_ON(!rwsem_is_locked(&sb->s_umount)); /* protect against inode wb switch, see inode_switch_wbs_work_fn() */ bdi_down_write_wb_switch_rwsem(bdi); bdi_split_work_to_wbs(bdi, &work, false); wb_wait_for_completion(&done); bdi_up_write_wb_switch_rwsem(bdi); wait_sb_inodes(sb); } EXPORT_SYMBOL(sync_inodes_sb); /** * write_inode_now - write an inode to disk * @inode: inode to write to disk * @sync: whether the write should be synchronous or not * * This function commits an inode to disk immediately if it is dirty. This is * primarily needed by knfsd. * * The caller must either have a ref on the inode or must have set I_WILL_FREE. */ int write_inode_now(struct inode *inode, int sync) { struct writeback_control wbc = { .nr_to_write = LONG_MAX, .sync_mode = sync ? WB_SYNC_ALL : WB_SYNC_NONE, .range_start = 0, .range_end = LLONG_MAX, }; if (!mapping_can_writeback(inode->i_mapping)) wbc.nr_to_write = 0; might_sleep(); return writeback_single_inode(inode, &wbc); } EXPORT_SYMBOL(write_inode_now); /** * sync_inode_metadata - write an inode to disk * @inode: the inode to sync * @wait: wait for I/O to complete. * * Write an inode to disk and adjust its dirty state after completion. * * Note: only writes the actual inode, no associated data or other metadata. */ int sync_inode_metadata(struct inode *inode, int wait) { struct writeback_control wbc = { .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE, .nr_to_write = 0, /* metadata-only */ }; return writeback_single_inode(inode, &wbc); } EXPORT_SYMBOL(sync_inode_metadata);
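The per-inode ownership accounting in wbc_account_cgroup_owner() above is an instance of the Boyer-Moore majority vote: bytes written for the current owner are counted directly, while a single running candidate absorbs or cancels the votes of every other writer. The following is a minimal user-space sketch of that bookkeeping, with illustrative names only (the kernel's additional last-round candidate, wb_lcand_*, is omitted here):

/*
 * Sketch of the majority-vote bookkeeping behind wbc_account_cgroup_owner().
 * Names are hypothetical; this is not kernel API.
 */
#include <stddef.h>
#include <stdio.h>

struct owner_vote {
	int cur_id;		/* current owning wb/cgroup id */
	size_t cur_bytes;	/* bytes written on behalf of the current owner */
	int tcand_id;		/* Boyer-Moore majority candidate among foreigners */
	size_t tcand_bytes;	/* candidate's running vote count */
};

static void account_owner(struct owner_vote *v, int id, size_t bytes)
{
	if (id == v->cur_id) {
		v->cur_bytes += bytes;
		return;
	}
	/* Boyer-Moore majority vote over the foreign writers */
	if (!v->tcand_bytes)
		v->tcand_id = id;
	if (id == v->tcand_id) {
		v->tcand_bytes += bytes;
	} else {
		size_t dec = bytes < v->tcand_bytes ? bytes : v->tcand_bytes;
		v->tcand_bytes -= dec;	/* cancel votes, never underflow */
	}
}

int main(void)
{
	struct owner_vote v = { .cur_id = 1 };

	/* cgroup 2 writes most of the foreign bytes, so it ends up as candidate */
	account_owner(&v, 1, 4096);
	account_owner(&v, 2, 8192);
	account_owner(&v, 3, 1024);
	account_owner(&v, 2, 4096);
	printf("owner=%d (%zu bytes), foreign candidate=%d (%zu votes)\n",
	       v.cur_id, v.cur_bytes, v.tcand_id, v.tcand_bytes);
	return 0;
}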
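Similarly, wb_split_bdi_pages() above hands each bdi_writeback a share of the bdi-wide page budget proportional to its average write bandwidth, passing LONG_MAX through unchanged and erring toward writing more in the degenerate cases. A rough, self-contained sketch of that arithmetic (hypothetical helper name, not kernel code):

/*
 * Sketch of the bandwidth-proportional split done by wb_split_bdi_pages():
 * scale nr_pages by this wb's share of the total bandwidth, rounded up.
 */
#include <limits.h>
#include <stdio.h>

static long split_pages(long nr_pages, unsigned long this_bw,
			unsigned long tot_bw)
{
	if (nr_pages == LONG_MAX)
		return LONG_MAX;	/* "write everything" passes through */
	if (!tot_bw || this_bw >= tot_bw)
		return nr_pages;	/* clean/degenerate: err on writing more */
	return (long)(((unsigned long long)nr_pages * this_bw + tot_bw - 1) /
		      tot_bw);		/* equivalent of DIV_ROUND_UP_ULL */
}

int main(void)
{
	/* a wb with 1/4 of the total bandwidth gets ~1/4 of the pages */
	printf("%ld\n", split_pages(1000, 25, 100));	/* prints 250 */
	printf("%ld\n", split_pages(1000, 0, 0));	/* prints 1000 */
	return 0;
}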
// SPDX-License-Identifier: GPL-2.0-or-later /* * Userspace interface * Linux ethernet bridge * * Authors: * Lennert Buytenhek <buytenh@gnu.org> */ #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> 
#include <linux/netpoll.h> #include <linux/ethtool.h> #include <linux/if_arp.h> #include <linux/module.h> #include <linux/init.h> #include <linux/rtnetlink.h> #include <linux/if_ether.h> #include <linux/slab.h> #include <net/dsa.h> #include <net/sock.h> #include <linux/if_vlan.h> #include <net/switchdev.h> #include <net/net_namespace.h> #include "br_private.h" /* * Determine initial path cost based on speed. * using recommendations from 802.1d standard * * Since driver might sleep need to not be holding any locks. */ static int port_cost(struct net_device *dev) { struct ethtool_link_ksettings ecmd; if (!__ethtool_get_link_ksettings(dev, &ecmd)) { switch (ecmd.base.speed) { case SPEED_10000: return 2; case SPEED_1000: return 4; case SPEED_100: return 19; case SPEED_10: return 100; } } /* Old silly heuristics based on name */ if (!strncmp(dev->name, "lec", 3)) return 7; if (!strncmp(dev->name, "plip", 4)) return 2500; return 100; /* assume old 10Mbps */ } /* Check for port carrier transitions. */ void br_port_carrier_check(struct net_bridge_port *p, bool *notified) { struct net_device *dev = p->dev; struct net_bridge *br = p->br; if (!(p->flags & BR_ADMIN_COST) && netif_running(dev) && netif_oper_up(dev)) p->path_cost = port_cost(dev); *notified = false; if (!netif_running(br->dev)) return; spin_lock_bh(&br->lock); if (netif_running(dev) && netif_oper_up(dev)) { if (p->state == BR_STATE_DISABLED) { br_stp_enable_port(p); *notified = true; } } else { if (p->state != BR_STATE_DISABLED) { br_stp_disable_port(p); *notified = true; } } spin_unlock_bh(&br->lock); } static void br_port_set_promisc(struct net_bridge_port *p) { int err = 0; if (br_promisc_port(p)) return; err = dev_set_promiscuity(p->dev, 1); if (err) return; br_fdb_unsync_static(p->br, p); p->flags |= BR_PROMISC; } static void br_port_clear_promisc(struct net_bridge_port *p) { int err; /* Check if the port is already non-promisc or if it doesn't * support UNICAST filtering. Without unicast filtering support * we'll end up re-enabling promisc mode anyway, so just check for * it here. */ if (!br_promisc_port(p) || !(p->dev->priv_flags & IFF_UNICAST_FLT)) return; /* Since we'll be clearing the promisc mode, program the port * first so that we don't have interruption in traffic. */ err = br_fdb_sync_static(p->br, p); if (err) return; dev_set_promiscuity(p->dev, -1); p->flags &= ~BR_PROMISC; } /* When a port is added or removed or when certain port flags * change, this function is called to automatically manage * promiscuity setting of all the bridge ports. We are always called * under RTNL so can skip using rcu primitives. */ void br_manage_promisc(struct net_bridge *br) { struct net_bridge_port *p; bool set_all = false; /* If vlan filtering is disabled or bridge interface is placed * into promiscuous mode, place all ports in promiscuous mode. */ if ((br->dev->flags & IFF_PROMISC) || !br_vlan_enabled(br->dev)) set_all = true; list_for_each_entry(p, &br->port_list, list) { if (set_all) { br_port_set_promisc(p); } else { /* If the number of auto-ports is <= 1, then all other * ports will have their output configuration * statically specified through fdbs. Since ingress * on the auto-port becomes forwarding/egress to other * ports and egress configuration is statically known, * we can say that ingress configuration of the * auto-port is also statically known. * This lets us disable promiscuous mode and write * this config to hw. 
*/ if ((p->dev->priv_flags & IFF_UNICAST_FLT) && (br->auto_cnt == 0 || (br->auto_cnt == 1 && br_auto_port(p)))) br_port_clear_promisc(p); else br_port_set_promisc(p); } } } int nbp_backup_change(struct net_bridge_port *p, struct net_device *backup_dev) { struct net_bridge_port *old_backup = rtnl_dereference(p->backup_port); struct net_bridge_port *backup_p = NULL; ASSERT_RTNL(); if (backup_dev) { if (!netif_is_bridge_port(backup_dev)) return -ENOENT; backup_p = br_port_get_rtnl(backup_dev); if (backup_p->br != p->br) return -EINVAL; } if (p == backup_p) return -EINVAL; if (old_backup == backup_p) return 0; /* if the backup link is already set, clear it */ if (old_backup) old_backup->backup_redirected_cnt--; if (backup_p) backup_p->backup_redirected_cnt++; rcu_assign_pointer(p->backup_port, backup_p); return 0; } static void nbp_backup_clear(struct net_bridge_port *p) { nbp_backup_change(p, NULL); if (p->backup_redirected_cnt) { struct net_bridge_port *cur_p; list_for_each_entry(cur_p, &p->br->port_list, list) { struct net_bridge_port *backup_p; backup_p = rtnl_dereference(cur_p->backup_port); if (backup_p == p) nbp_backup_change(cur_p, NULL); } } WARN_ON(rcu_access_pointer(p->backup_port) || p->backup_redirected_cnt); } static void nbp_update_port_count(struct net_bridge *br) { struct net_bridge_port *p; u32 cnt = 0; list_for_each_entry(p, &br->port_list, list) { if (br_auto_port(p)) cnt++; } if (br->auto_cnt != cnt) { br->auto_cnt = cnt; br_manage_promisc(br); } } static void nbp_delete_promisc(struct net_bridge_port *p) { /* If port is currently promiscuous, unset promiscuity. * Otherwise, it is a static port so remove all addresses * from it. */ dev_set_allmulti(p->dev, -1); if (br_promisc_port(p)) dev_set_promiscuity(p->dev, -1); else br_fdb_unsync_static(p->br, p); } static void release_nbp(struct kobject *kobj) { struct net_bridge_port *p = container_of(kobj, struct net_bridge_port, kobj); kfree(p); } static void brport_get_ownership(struct kobject *kobj, kuid_t *uid, kgid_t *gid) { struct net_bridge_port *p = kobj_to_brport(kobj); net_ns_get_ownership(dev_net(p->dev), uid, gid); } static struct kobj_type brport_ktype = { #ifdef CONFIG_SYSFS .sysfs_ops = &brport_sysfs_ops, #endif .release = release_nbp, .get_ownership = brport_get_ownership, }; static void destroy_nbp(struct net_bridge_port *p) { struct net_device *dev = p->dev; p->br = NULL; p->dev = NULL; dev_put(dev); kobject_put(&p->kobj); } static void destroy_nbp_rcu(struct rcu_head *head) { struct net_bridge_port *p = container_of(head, struct net_bridge_port, rcu); destroy_nbp(p); } static unsigned get_max_headroom(struct net_bridge *br) { unsigned max_headroom = 0; struct net_bridge_port *p; list_for_each_entry(p, &br->port_list, list) { unsigned dev_headroom = netdev_get_fwd_headroom(p->dev); if (dev_headroom > max_headroom) max_headroom = dev_headroom; } return max_headroom; } static void update_headroom(struct net_bridge *br, int new_hr) { struct net_bridge_port *p; list_for_each_entry(p, &br->port_list, list) netdev_set_rx_headroom(p->dev, new_hr); br->dev->needed_headroom = new_hr; } /* Delete port(interface) from bridge is done in two steps. * via RCU. First step, marks device as down. That deletes * all the timers and stops new packets from flowing through. * * Final cleanup doesn't occur until after all CPU's finished * processing packets. 
* * Protected from multiple admin operations by RTNL mutex */ static void del_nbp(struct net_bridge_port *p) { struct net_bridge *br = p->br; struct net_device *dev = p->dev; sysfs_remove_link(br->ifobj, p->dev->name); nbp_delete_promisc(p); spin_lock_bh(&br->lock); br_stp_disable_port(p); spin_unlock_bh(&br->lock); br_mrp_port_del(br, p); br_cfm_port_del(br, p); br_ifinfo_notify(RTM_DELLINK, NULL, p); list_del_rcu(&p->list); if (netdev_get_fwd_headroom(dev) == br->dev->needed_headroom) update_headroom(br, get_max_headroom(br)); netdev_reset_rx_headroom(dev); nbp_vlan_flush(p); br_fdb_delete_by_port(br, p, 0, 1); switchdev_deferred_process(); nbp_backup_clear(p); nbp_update_port_count(br); netdev_upper_dev_unlink(dev, br->dev); dev->priv_flags &= ~IFF_BRIDGE_PORT; netdev_rx_handler_unregister(dev); br_multicast_del_port(p); kobject_uevent(&p->kobj, KOBJ_REMOVE); kobject_del(&p->kobj); br_netpoll_disable(p); call_rcu(&p->rcu, destroy_nbp_rcu); } /* Delete bridge device */ void br_dev_delete(struct net_device *dev, struct list_head *head) { struct net_bridge *br = netdev_priv(dev); struct net_bridge_port *p, *n; list_for_each_entry_safe(p, n, &br->port_list, list) { del_nbp(p); } br_recalculate_neigh_suppress_enabled(br); br_fdb_delete_by_port(br, NULL, 0, 1); cancel_delayed_work_sync(&br->gc_work); br_sysfs_delbr(br->dev); unregister_netdevice_queue(br->dev, head); } /* find an available port number */ static int find_portno(struct net_bridge *br) { int index; struct net_bridge_port *p; unsigned long *inuse; inuse = bitmap_zalloc(BR_MAX_PORTS, GFP_KERNEL); if (!inuse) return -ENOMEM; set_bit(0, inuse); /* zero is reserved */ list_for_each_entry(p, &br->port_list, list) { set_bit(p->port_no, inuse); } index = find_first_zero_bit(inuse, BR_MAX_PORTS); bitmap_free(inuse); return (index >= BR_MAX_PORTS) ? -EXFULL : index; } /* called with RTNL but without bridge lock */ static struct net_bridge_port *new_nbp(struct net_bridge *br, struct net_device *dev) { struct net_bridge_port *p; int index, err; index = find_portno(br); if (index < 0) return ERR_PTR(index); p = kzalloc(sizeof(*p), GFP_KERNEL); if (p == NULL) return ERR_PTR(-ENOMEM); p->br = br; dev_hold(dev); p->dev = dev; p->path_cost = port_cost(dev); p->priority = 0x8000 >> BR_PORT_BITS; p->port_no = index; p->flags = BR_LEARNING | BR_FLOOD | BR_MCAST_FLOOD | BR_BCAST_FLOOD; br_init_port(p); br_set_state(p, BR_STATE_DISABLED); br_stp_port_timer_init(p); err = br_multicast_add_port(p); if (err) { dev_put(dev); kfree(p); p = ERR_PTR(err); } return p; } int br_add_bridge(struct net *net, const char *name) { struct net_device *dev; int res; dev = alloc_netdev(sizeof(struct net_bridge), name, NET_NAME_UNKNOWN, br_dev_setup); if (!dev) return -ENOMEM; dev_net_set(dev, net); dev->rtnl_link_ops = &br_link_ops; res = register_netdevice(dev); if (res) free_netdev(dev); return res; } int br_del_bridge(struct net *net, const char *name) { struct net_device *dev; int ret = 0; dev = __dev_get_by_name(net, name); if (dev == NULL) ret = -ENXIO; /* Could not find device */ else if (!(dev->priv_flags & IFF_EBRIDGE)) { /* Attempt to delete non bridge device! */ ret = -EPERM; } else if (dev->flags & IFF_UP) { /* Not shutdown yet. 
*/ ret = -EBUSY; } else br_dev_delete(dev, NULL); return ret; } /* MTU of the bridge pseudo-device: ETH_DATA_LEN or the minimum of the ports */ static int br_mtu_min(const struct net_bridge *br) { const struct net_bridge_port *p; int ret_mtu = 0; list_for_each_entry(p, &br->port_list, list) if (!ret_mtu || ret_mtu > p->dev->mtu) ret_mtu = p->dev->mtu; return ret_mtu ? ret_mtu : ETH_DATA_LEN; } void br_mtu_auto_adjust(struct net_bridge *br) { ASSERT_RTNL(); /* if the bridge MTU was manually configured don't mess with it */ if (br_opt_get(br, BROPT_MTU_SET_BY_USER)) return; /* change to the minimum MTU and clear the flag which was set by * the bridge ndo_change_mtu callback */ dev_set_mtu(br->dev, br_mtu_min(br)); br_opt_toggle(br, BROPT_MTU_SET_BY_USER, false); } static void br_set_gso_limits(struct net_bridge *br) { unsigned int gso_max_size = GSO_MAX_SIZE; u16 gso_max_segs = GSO_MAX_SEGS; const struct net_bridge_port *p; list_for_each_entry(p, &br->port_list, list) { gso_max_size = min(gso_max_size, p->dev->gso_max_size); gso_max_segs = min(gso_max_segs, p->dev->gso_max_segs); } br->dev->gso_max_size = gso_max_size; br->dev->gso_max_segs = gso_max_segs; } /* * Recomputes features using slave's features */ netdev_features_t br_features_recompute(struct net_bridge *br, netdev_features_t features) { struct net_bridge_port *p; netdev_features_t mask; if (list_empty(&br->port_list)) return features; mask = features; features &= ~NETIF_F_ONE_FOR_ALL; list_for_each_entry(p, &br->port_list, list) { features = netdev_increment_features(features, p->dev->features, mask); } features = netdev_add_tso_features(features, mask); return features; } /* called with RTNL */ int br_add_if(struct net_bridge *br, struct net_device *dev, struct netlink_ext_ack *extack) { struct net_bridge_port *p; int err = 0; unsigned br_hr, dev_hr; bool changed_addr, fdb_synced = false; /* Don't allow bridging non-ethernet like devices. */ if ((dev->flags & IFF_LOOPBACK) || dev->type != ARPHRD_ETHER || dev->addr_len != ETH_ALEN || !is_valid_ether_addr(dev->dev_addr)) return -EINVAL; /* Also don't allow bridging of net devices that are DSA masters, since * the bridge layer rx_handler prevents the DSA fake ethertype handler * to be invoked, so we don't get the chance to strip off and parse the * DSA switch tag protocol header (the bridge layer just returns * RX_HANDLER_CONSUMED, stopping RX processing for these frames). * The only case where that would not be an issue is when bridging can * already be offloaded, such as when the DSA master is itself a DSA * or plain switchdev port, and is bridged only with other ports from * the same hardware device. */ if (netdev_uses_dsa(dev)) { list_for_each_entry(p, &br->port_list, list) { if (!netdev_port_same_parent_id(dev, p->dev)) { NL_SET_ERR_MSG(extack, "Cannot do software bridging with a DSA master"); return -EINVAL; } } } /* No bridging of bridges */ if (dev->netdev_ops->ndo_start_xmit == br_dev_xmit) { NL_SET_ERR_MSG(extack, "Can not enslave a bridge to a bridge"); return -ELOOP; } /* Device has master upper dev */ if (netdev_master_upper_dev_get(dev)) return -EBUSY; /* No bridging devices that dislike that (e.g. 
wireless) */ if (dev->priv_flags & IFF_DONT_BRIDGE) { NL_SET_ERR_MSG(extack, "Device does not allow enslaving to a bridge"); return -EOPNOTSUPP; } p = new_nbp(br, dev); if (IS_ERR(p)) return PTR_ERR(p); call_netdevice_notifiers(NETDEV_JOIN, dev); err = dev_set_allmulti(dev, 1); if (err) { br_multicast_del_port(p); kfree(p); /* kobject not yet init'd, manually free */ goto err1; } err = kobject_init_and_add(&p->kobj, &brport_ktype, &(dev->dev.kobj), SYSFS_BRIDGE_PORT_ATTR); if (err) goto err2; err = br_sysfs_addif(p); if (err) goto err2; err = br_netpoll_enable(p); if (err) goto err3; err = netdev_rx_handler_register(dev, br_get_rx_handler(dev), p); if (err) goto err4; dev->priv_flags |= IFF_BRIDGE_PORT; err = netdev_master_upper_dev_link(dev, br->dev, NULL, NULL, extack); if (err) goto err5; dev_disable_lro(dev); list_add_rcu(&p->list, &br->port_list); nbp_update_port_count(br); if (!br_promisc_port(p) && (p->dev->priv_flags & IFF_UNICAST_FLT)) { /* When updating the port count we also update all ports' * promiscuous mode. * A port leaving promiscuous mode normally gets the bridge's * fdb synced to the unicast filter (if supported), however, * `br_port_clear_promisc` does not distinguish between * non-promiscuous ports and *new* ports, so we need to * sync explicitly here. */ fdb_synced = br_fdb_sync_static(br, p) == 0; if (!fdb_synced) netdev_err(dev, "failed to sync bridge static fdb addresses to this port\n"); } netdev_update_features(br->dev); br_hr = br->dev->needed_headroom; dev_hr = netdev_get_fwd_headroom(dev); if (br_hr < dev_hr) update_headroom(br, dev_hr); else netdev_set_rx_headroom(dev, br_hr); if (br_fdb_insert(br, p, dev->dev_addr, 0)) netdev_err(dev, "failed insert local address bridge forwarding table\n"); if (br->dev->addr_assign_type != NET_ADDR_SET) { /* Ask for permission to use this MAC address now, even if we * don't end up choosing it below. */ err = dev_pre_changeaddr_notify(br->dev, dev->dev_addr, extack); if (err) goto err6; } err = nbp_vlan_init(p, extack); if (err) { netdev_err(dev, "failed to initialize vlan filtering on this port\n"); goto err6; } spin_lock_bh(&br->lock); changed_addr = br_stp_recalculate_bridge_id(br); if (netif_running(dev) && netif_oper_up(dev) && (br->dev->flags & IFF_UP)) br_stp_enable_port(p); spin_unlock_bh(&br->lock); br_ifinfo_notify(RTM_NEWLINK, NULL, p); if (changed_addr) call_netdevice_notifiers(NETDEV_CHANGEADDR, br->dev); br_mtu_auto_adjust(br); br_set_gso_limits(br); kobject_uevent(&p->kobj, KOBJ_ADD); return 0; err6: if (fdb_synced) br_fdb_unsync_static(br, p); list_del_rcu(&p->list); br_fdb_delete_by_port(br, p, 0, 1); nbp_update_port_count(br); netdev_upper_dev_unlink(dev, br->dev); err5: dev->priv_flags &= ~IFF_BRIDGE_PORT; netdev_rx_handler_unregister(dev); err4: br_netpoll_disable(p); err3: sysfs_remove_link(br->ifobj, p->dev->name); err2: br_multicast_del_port(p); kobject_put(&p->kobj); dev_set_allmulti(dev, -1); err1: dev_put(dev); return err; } /* called with RTNL */ int br_del_if(struct net_bridge *br, struct net_device *dev) { struct net_bridge_port *p; bool changed_addr; p = br_port_get_rtnl(dev); if (!p || p->br != br) return -EINVAL; /* Since more than one interface can be attached to a bridge, * there still maybe an alternate path for netconsole to use; * therefore there is no reason for a NETDEV_RELEASE event. 
 */
	del_nbp(p);

	br_mtu_auto_adjust(br);
	br_set_gso_limits(br);

	spin_lock_bh(&br->lock);
	changed_addr = br_stp_recalculate_bridge_id(br);
	spin_unlock_bh(&br->lock);

	if (changed_addr)
		call_netdevice_notifiers(NETDEV_CHANGEADDR, br->dev);

	netdev_update_features(br->dev);

	return 0;
}

void br_port_flags_change(struct net_bridge_port *p, unsigned long mask)
{
	struct net_bridge *br = p->br;

	if (mask & BR_AUTO_MASK)
		nbp_update_port_count(br);

	if (mask & BR_NEIGH_SUPPRESS)
		br_recalculate_neigh_suppress_enabled(br);
}

bool br_port_flag_is_set(const struct net_device *dev, unsigned long flag)
{
	struct net_bridge_port *p;

	p = br_port_get_rtnl_rcu(dev);
	if (!p)
		return false;

	return p->flags & flag;
}
EXPORT_SYMBOL_GPL(br_port_flag_is_set);
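The promiscuity policy implemented by br_manage_promisc() and br_port_clear_promisc() above is spread across several functions and a long comment. The condensed predicate below restates that decision in ordinary user-space C as a reading aid; the struct names and fields are illustrative stand-ins for the kernel state the real code consults (IFF_PROMISC, VLAN filtering, IFF_UNICAST_FLT, br->auto_cnt), not part of any bridge API.

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical condensed view of the state br_manage_promisc() consults. */
struct port_view {
	bool unicast_flt;	/* device supports unicast filtering (IFF_UNICAST_FLT) */
	bool auto_port;		/* port has learning/flooding (BR_AUTO_MASK) flags set */
};

struct bridge_view {
	bool promisc;		/* bridge device itself has IFF_PROMISC set */
	bool vlan_filtering;	/* per-bridge VLAN filtering enabled */
	unsigned int auto_cnt;	/* number of auto ports on the bridge */
};

/* True when the port may run with promiscuous mode disabled: the bridge is
 * not forcing promiscuity, the device can filter unicast in hardware, and
 * the only auto port (if any) is this port itself, so its ingress
 * configuration is statically known. */
static bool port_can_clear_promisc(const struct bridge_view *br,
				   const struct port_view *p)
{
	if (br->promisc || !br->vlan_filtering)
		return false;
	if (!p->unicast_flt)
		return false;
	return br->auto_cnt == 0 || (br->auto_cnt == 1 && p->auto_port);
}

int main(void)
{
	struct bridge_view br = { .promisc = false, .vlan_filtering = true, .auto_cnt = 1 };
	struct port_view p = { .unicast_flt = true, .auto_port = true };

	printf("clear promisc: %s\n", port_can_clear_promisc(&br, &p) ? "yes" : "no");
	return 0;
}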
/* SPDX-License-Identifier: GPL-2.0-or-later */
/* delayacct.h - per-task delay accounting
 *
 * Copyright (C) Shailabh Nagar, IBM Corp. 2006
 */

#ifndef _LINUX_DELAYACCT_H
#define _LINUX_DELAYACCT_H

#include <uapi/linux/taskstats.h>

/*
 * Per-task flags relevant to delay accounting
 * maintained privately to avoid exhausting similar flags in sched.h:PF_*
 * Used to set current->delays->flags
 */
#define DELAYACCT_PF_SWAPIN	0x00000001	/* I am doing a swapin */
#define DELAYACCT_PF_BLKIO	0x00000002	/* I am waiting on IO */

#ifdef CONFIG_TASK_DELAY_ACCT
struct task_delay_info {
	raw_spinlock_t	lock;
	unsigned int	flags;	/* Private per-task flags */

	/* For each stat XXX, add following, aligned appropriately
	 *
	 * struct timespec XXX_start, XXX_end;
	 * u64 XXX_delay;
	 * u32 XXX_count;
	 *
	 * Atomicity of updates to XXX_delay, XXX_count protected by
	 * single lock above (split into XXX_lock if contention is an issue).
	 */

	/*
	 * XXX_count is incremented on every XXX operation, the delay
	 * associated with the operation is added to XXX_delay.
	 * XXX_delay contains the accumulated delay time in nanoseconds.
	 */
	u64 blkio_start;	/* Shared by blkio, swapin */
	u64 blkio_delay;	/* wait for sync block io completion */
	u64 swapin_delay;	/* wait for swapin block io completion */
	u32 blkio_count;	/* total count of the number of sync block */
				/* io operations performed */
	u32 swapin_count;	/* total count of the number of swapin block */
				/* io operations performed */

	u64 freepages_start;
	u64 freepages_delay;	/* wait for memory reclaim */

	u64 thrashing_start;
	u64 thrashing_delay;	/* wait for thrashing page */

	u32 freepages_count;	/* total count of memory reclaim */
	u32 thrashing_count;	/* total count of thrash waits */
};
#endif

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/jump_label.h>

#ifdef CONFIG_TASK_DELAY_ACCT
DECLARE_STATIC_KEY_FALSE(delayacct_key);
extern int delayacct_on;	/* Delay accounting turned on/off */
extern struct kmem_cache *delayacct_cache;
extern void delayacct_init(void);

extern int sysctl_delayacct(struct ctl_table *table, int write, void *buffer,
			    size_t *lenp, loff_t *ppos);

extern void __delayacct_tsk_init(struct task_struct *);
extern void __delayacct_tsk_exit(struct task_struct *);
extern void __delayacct_blkio_start(void);
extern void __delayacct_blkio_end(struct task_struct *);
extern int delayacct_add_tsk(struct taskstats *, struct task_struct *);
extern __u64 __delayacct_blkio_ticks(struct task_struct *);
extern void __delayacct_freepages_start(void);
extern void __delayacct_freepages_end(void);
extern void __delayacct_thrashing_start(void);
extern void __delayacct_thrashing_end(void);

static inline int delayacct_is_task_waiting_on_io(struct task_struct *p)
{
	if (p->delays)
		return (p->delays->flags & DELAYACCT_PF_BLKIO);
	else
		return 0;
}

static inline void delayacct_set_flag(struct task_struct *p, int flag)
{
	if (p->delays)
		p->delays->flags |= flag;
}

static inline void delayacct_clear_flag(struct task_struct *p, int flag)
{
	if (p->delays)
		p->delays->flags &= ~flag;
}

static inline void delayacct_tsk_init(struct task_struct *tsk)
{
	/* reinitialize in case parent's non-null pointer was dup'ed*/
	tsk->delays = NULL;
	if (delayacct_on)
		__delayacct_tsk_init(tsk);
}

/* Free tsk->delays.
 * Called from bad fork and __put_task_struct
 * where there's no risk of tsk->delays being accessed elsewhere */
static inline void delayacct_tsk_free(struct task_struct *tsk)
{
	if (tsk->delays)
		kmem_cache_free(delayacct_cache, tsk->delays);
	tsk->delays = NULL;
}

static inline void delayacct_blkio_start(void)
{
	if (!static_branch_unlikely(&delayacct_key))
		return;

	delayacct_set_flag(current, DELAYACCT_PF_BLKIO);
	if (current->delays)
		__delayacct_blkio_start();
}

static inline void delayacct_blkio_end(struct task_struct *p)
{
	if (!static_branch_unlikely(&delayacct_key))
		return;

	if (p->delays)
		__delayacct_blkio_end(p);
	delayacct_clear_flag(p, DELAYACCT_PF_BLKIO);
}

static inline __u64 delayacct_blkio_ticks(struct task_struct *tsk)
{
	if (tsk->delays)
		return __delayacct_blkio_ticks(tsk);
	return 0;
}

static inline void delayacct_freepages_start(void)
{
	if (current->delays)
		__delayacct_freepages_start();
}

static inline void delayacct_freepages_end(void)
{
	if (current->delays)
		__delayacct_freepages_end();
}

static inline void delayacct_thrashing_start(void)
{
	if (current->delays)
		__delayacct_thrashing_start();
}

static inline void delayacct_thrashing_end(void)
{
	if (current->delays)
		__delayacct_thrashing_end();
}

#else
static inline void delayacct_set_flag(struct task_struct *p, int flag)
{}
static inline void delayacct_clear_flag(struct task_struct *p, int flag)
{}
static inline void delayacct_init(void)
{}
static inline void delayacct_tsk_init(struct task_struct *tsk)
{}
static inline void delayacct_tsk_free(struct task_struct *tsk)
{}
static inline void delayacct_blkio_start(void)
{}
static inline void delayacct_blkio_end(struct task_struct *p)
{}
static inline int delayacct_add_tsk(struct taskstats *d,
				    struct task_struct *tsk)
{ return 0; }
static inline __u64 delayacct_blkio_ticks(struct task_struct *tsk)
{ return 0; }
static inline int delayacct_is_task_waiting_on_io(struct task_struct *p)
{ return 0; }
static inline void delayacct_freepages_start(void)
{}
static inline void delayacct_freepages_end(void)
{}
static inline void delayacct_thrashing_start(void)
{}
static inline void delayacct_thrashing_end(void)
{}
#endif /* CONFIG_TASK_DELAY_ACCT */

#endif
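The struct comment in delayacct.h above describes the pattern every delay class follows: record a start timestamp, add the elapsed nanoseconds to XXX_delay when the operation completes, and bump XXX_count, all under the single per-task spinlock. The sketch below is a minimal user-space model of that bookkeeping only; the names are invented for illustration, and it deliberately omits the locking and the static-key gating (delayacct_key) that the kernel wrappers add.

#include <stdint.h>
#include <stdio.h>
#include <time.h>

/* Illustrative analogue of one XXX_start/XXX_delay/XXX_count triple. */
struct delay_stat {
	uint64_t start_ns;	/* timestamp taken at XXX_start */
	uint64_t delay_ns;	/* accumulated wait time (XXX_delay) */
	uint32_t count;		/* completed operations (XXX_count) */
};

static uint64_t now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000000ull + ts.tv_nsec;
}

static void delay_start(struct delay_stat *s)
{
	s->start_ns = now_ns();
}

static void delay_end(struct delay_stat *s)
{
	s->delay_ns += now_ns() - s->start_ns;	/* add this operation's wait */
	s->count++;				/* one more completed operation */
}

int main(void)
{
	struct delay_stat blkio = { 0 };

	delay_start(&blkio);
	/* ... the task would block on synchronous I/O here ... */
	delay_end(&blkio);

	printf("ops=%u total_delay_ns=%llu\n", blkio.count,
	       (unsigned long long)blkio.delay_ns);
	return 0;
}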
/*
 * net/tipc/node.c: TIPC node management routines
 *
 * Copyright (c) 2000-2006, 2012-2016, Ericsson AB
 * Copyright (c) 2005-2006, 2010-2014, Wind River Systems
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the names of the copyright holders nor the names of its
 *    contributors may be used to endorse or promote products derived from
 *    this software without specific prior written permission.
 *
 * Alternatively, this software may be distributed under the terms of the
 * GNU General Public License ("GPL") version 2 as published by the Free
 * Software Foundation.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
*/ #include "core.h" #include "link.h" #include "node.h" #include "name_distr.h" #include "socket.h" #include "bcast.h" #include "monitor.h" #include "discover.h" #include "netlink.h" #include "trace.h" #include "crypto.h" #define INVALID_NODE_SIG 0x10000 #define NODE_CLEANUP_AFTER 300000 /* Flags used to take different actions according to flag type * TIPC_NOTIFY_NODE_DOWN: notify node is down * TIPC_NOTIFY_NODE_UP: notify node is up * TIPC_DISTRIBUTE_NAME: publish or withdraw link state name type */ enum { TIPC_NOTIFY_NODE_DOWN = (1 << 3), TIPC_NOTIFY_NODE_UP = (1 << 4), TIPC_NOTIFY_LINK_UP = (1 << 6), TIPC_NOTIFY_LINK_DOWN = (1 << 7) }; struct tipc_link_entry { struct tipc_link *link; spinlock_t lock; /* per link */ u32 mtu; struct sk_buff_head inputq; struct tipc_media_addr maddr; }; struct tipc_bclink_entry { struct tipc_link *link; struct sk_buff_head inputq1; struct sk_buff_head arrvq; struct sk_buff_head inputq2; struct sk_buff_head namedq; u16 named_rcv_nxt; bool named_open; }; /** * struct tipc_node - TIPC node structure * @addr: network address of node * @kref: reference counter to node object * @lock: rwlock governing access to structure * @net: the applicable net namespace * @hash: links to adjacent nodes in unsorted hash chain * @inputq: pointer to input queue containing messages for msg event * @namedq: pointer to name table input queue with name table messages * @active_links: bearer ids of active links, used as index into links[] array * @links: array containing references to all links to node * @bc_entry: broadcast link entry * @action_flags: bit mask of different types of node actions * @state: connectivity state vs peer node * @preliminary: a preliminary node or not * @failover_sent: failover sent or not * @sync_point: sequence number where synch/failover is finished * @list: links to adjacent nodes in sorted list of cluster's nodes * @working_links: number of working links to node (both active and standby) * @link_cnt: number of links to node * @capabilities: bitmap, indicating peer node's functional capabilities * @signature: node instance identifier * @link_id: local and remote bearer ids of changing link, if any * @peer_id: 128-bit ID of peer * @peer_id_string: ID string of peer * @publ_list: list of publications * @conn_sks: list of connections (FIXME) * @timer: node's keepalive timer * @keepalive_intv: keepalive interval in milliseconds * @rcu: rcu struct for tipc_node * @delete_at: indicates the time for deleting a down node * @peer_net: peer's net namespace * @peer_hash_mix: hash for this peer (FIXME) * @crypto_rx: RX crypto handler */ struct tipc_node { u32 addr; struct kref kref; rwlock_t lock; struct net *net; struct hlist_node hash; int active_links[2]; struct tipc_link_entry links[MAX_BEARERS]; struct tipc_bclink_entry bc_entry; int action_flags; struct list_head list; int state; bool preliminary; bool failover_sent; u16 sync_point; int link_cnt; u16 working_links; u16 capabilities; u32 signature; u32 link_id; u8 peer_id[16]; char peer_id_string[NODE_ID_STR_LEN]; struct list_head publ_list; struct list_head conn_sks; unsigned long keepalive_intv; struct timer_list timer; struct rcu_head rcu; unsigned long delete_at; struct net *peer_net; u32 peer_hash_mix; #ifdef CONFIG_TIPC_CRYPTO struct tipc_crypto *crypto_rx; #endif }; /* Node FSM states and events: */ enum { SELF_DOWN_PEER_DOWN = 0xdd, SELF_UP_PEER_UP = 0xaa, SELF_DOWN_PEER_LEAVING = 0xd1, SELF_UP_PEER_COMING = 0xac, SELF_COMING_PEER_UP = 0xca, SELF_LEAVING_PEER_DOWN = 0x1d, NODE_FAILINGOVER = 0xf0, 
NODE_SYNCHING = 0xcc }; enum { SELF_ESTABL_CONTACT_EVT = 0xece, SELF_LOST_CONTACT_EVT = 0x1ce, PEER_ESTABL_CONTACT_EVT = 0x9ece, PEER_LOST_CONTACT_EVT = 0x91ce, NODE_FAILOVER_BEGIN_EVT = 0xfbe, NODE_FAILOVER_END_EVT = 0xfee, NODE_SYNCH_BEGIN_EVT = 0xcbe, NODE_SYNCH_END_EVT = 0xcee }; static void __tipc_node_link_down(struct tipc_node *n, int *bearer_id, struct sk_buff_head *xmitq, struct tipc_media_addr **maddr); static void tipc_node_link_down(struct tipc_node *n, int bearer_id, bool delete); static void node_lost_contact(struct tipc_node *n, struct sk_buff_head *inputq); static void tipc_node_delete(struct tipc_node *node); static void tipc_node_timeout(struct timer_list *t); static void tipc_node_fsm_evt(struct tipc_node *n, int evt); static struct tipc_node *tipc_node_find(struct net *net, u32 addr); static struct tipc_node *tipc_node_find_by_id(struct net *net, u8 *id); static bool node_is_up(struct tipc_node *n); static void tipc_node_delete_from_list(struct tipc_node *node); struct tipc_sock_conn { u32 port; u32 peer_port; u32 peer_node; struct list_head list; }; static struct tipc_link *node_active_link(struct tipc_node *n, int sel) { int bearer_id = n->active_links[sel & 1]; if (unlikely(bearer_id == INVALID_BEARER_ID)) return NULL; return n->links[bearer_id].link; } int tipc_node_get_mtu(struct net *net, u32 addr, u32 sel, bool connected) { struct tipc_node *n; int bearer_id; unsigned int mtu = MAX_MSG_SIZE; n = tipc_node_find(net, addr); if (unlikely(!n)) return mtu; /* Allow MAX_MSG_SIZE when building connection oriented message * if they are in the same core network */ if (n->peer_net && connected) { tipc_node_put(n); return mtu; } bearer_id = n->active_links[sel & 1]; if (likely(bearer_id != INVALID_BEARER_ID)) mtu = n->links[bearer_id].mtu; tipc_node_put(n); return mtu; } bool tipc_node_get_id(struct net *net, u32 addr, u8 *id) { u8 *own_id = tipc_own_id(net); struct tipc_node *n; if (!own_id) return true; if (addr == tipc_own_addr(net)) { memcpy(id, own_id, TIPC_NODEID_LEN); return true; } n = tipc_node_find(net, addr); if (!n) return false; memcpy(id, &n->peer_id, TIPC_NODEID_LEN); tipc_node_put(n); return true; } u16 tipc_node_get_capabilities(struct net *net, u32 addr) { struct tipc_node *n; u16 caps; n = tipc_node_find(net, addr); if (unlikely(!n)) return TIPC_NODE_CAPABILITIES; caps = n->capabilities; tipc_node_put(n); return caps; } u32 tipc_node_get_addr(struct tipc_node *node) { return (node) ? node->addr : 0; } char *tipc_node_get_id_str(struct tipc_node *node) { return node->peer_id_string; } #ifdef CONFIG_TIPC_CRYPTO /** * tipc_node_crypto_rx - Retrieve crypto RX handle from node * @__n: target tipc_node * Note: node ref counter must be held first! */ struct tipc_crypto *tipc_node_crypto_rx(struct tipc_node *__n) { return (__n) ? __n->crypto_rx : NULL; } struct tipc_crypto *tipc_node_crypto_rx_by_list(struct list_head *pos) { return container_of(pos, struct tipc_node, list)->crypto_rx; } struct tipc_crypto *tipc_node_crypto_rx_by_addr(struct net *net, u32 addr) { struct tipc_node *n; n = tipc_node_find(net, addr); return (n) ? 
n->crypto_rx : NULL; } #endif static void tipc_node_free(struct rcu_head *rp) { struct tipc_node *n = container_of(rp, struct tipc_node, rcu); #ifdef CONFIG_TIPC_CRYPTO tipc_crypto_stop(&n->crypto_rx); #endif kfree(n); } static void tipc_node_kref_release(struct kref *kref) { struct tipc_node *n = container_of(kref, struct tipc_node, kref); kfree(n->bc_entry.link); call_rcu(&n->rcu, tipc_node_free); } void tipc_node_put(struct tipc_node *node) { kref_put(&node->kref, tipc_node_kref_release); } void tipc_node_get(struct tipc_node *node) { kref_get(&node->kref); } /* * tipc_node_find - locate specified node object, if it exists */ static struct tipc_node *tipc_node_find(struct net *net, u32 addr) { struct tipc_net *tn = tipc_net(net); struct tipc_node *node; unsigned int thash = tipc_hashfn(addr); rcu_read_lock(); hlist_for_each_entry_rcu(node, &tn->node_htable[thash], hash) { if (node->addr != addr || node->preliminary) continue; if (!kref_get_unless_zero(&node->kref)) node = NULL; break; } rcu_read_unlock(); return node; } /* tipc_node_find_by_id - locate specified node object by its 128-bit id * Note: this function is called only when a discovery request failed * to find the node by its 32-bit id, and is not time critical */ static struct tipc_node *tipc_node_find_by_id(struct net *net, u8 *id) { struct tipc_net *tn = tipc_net(net); struct tipc_node *n; bool found = false; rcu_read_lock(); list_for_each_entry_rcu(n, &tn->node_list, list) { read_lock_bh(&n->lock); if (!memcmp(id, n->peer_id, 16) && kref_get_unless_zero(&n->kref)) found = true; read_unlock_bh(&n->lock); if (found) break; } rcu_read_unlock(); return found ? n : NULL; } static void tipc_node_read_lock(struct tipc_node *n) __acquires(n->lock) { read_lock_bh(&n->lock); } static void tipc_node_read_unlock(struct tipc_node *n) __releases(n->lock) { read_unlock_bh(&n->lock); } static void tipc_node_write_lock(struct tipc_node *n) __acquires(n->lock) { write_lock_bh(&n->lock); } static void tipc_node_write_unlock_fast(struct tipc_node *n) __releases(n->lock) { write_unlock_bh(&n->lock); } static void tipc_node_write_unlock(struct tipc_node *n) __releases(n->lock) { struct tipc_socket_addr sk; struct net *net = n->net; u32 flags = n->action_flags; struct list_head *publ_list; struct tipc_uaddr ua; u32 bearer_id, node; if (likely(!flags)) { write_unlock_bh(&n->lock); return; } tipc_uaddr(&ua, TIPC_SERVICE_RANGE, TIPC_NODE_SCOPE, TIPC_LINK_STATE, n->addr, n->addr); sk.ref = n->link_id; sk.node = tipc_own_addr(net); node = n->addr; bearer_id = n->link_id & 0xffff; publ_list = &n->publ_list; n->action_flags &= ~(TIPC_NOTIFY_NODE_DOWN | TIPC_NOTIFY_NODE_UP | TIPC_NOTIFY_LINK_DOWN | TIPC_NOTIFY_LINK_UP); write_unlock_bh(&n->lock); if (flags & TIPC_NOTIFY_NODE_DOWN) tipc_publ_notify(net, publ_list, node, n->capabilities); if (flags & TIPC_NOTIFY_NODE_UP) tipc_named_node_up(net, node, n->capabilities); if (flags & TIPC_NOTIFY_LINK_UP) { tipc_mon_peer_up(net, node, bearer_id); tipc_nametbl_publish(net, &ua, &sk, sk.ref); } if (flags & TIPC_NOTIFY_LINK_DOWN) { tipc_mon_peer_down(net, node, bearer_id); tipc_nametbl_withdraw(net, &ua, &sk, sk.ref); } } static void tipc_node_assign_peer_net(struct tipc_node *n, u32 hash_mixes) { int net_id = tipc_netid(n->net); struct tipc_net *tn_peer; struct net *tmp; u32 hash_chk; if (n->peer_net) return; for_each_net_rcu(tmp) { tn_peer = tipc_net(tmp); if (!tn_peer) continue; /* Integrity checking whether node exists in namespace or not */ if (tn_peer->net_id != net_id) continue; if (memcmp(n->peer_id, 
tn_peer->node_id, NODE_ID_LEN)) continue; hash_chk = tipc_net_hash_mixes(tmp, tn_peer->random); if (hash_mixes ^ hash_chk) continue; n->peer_net = tmp; n->peer_hash_mix = hash_mixes; break; } } struct tipc_node *tipc_node_create(struct net *net, u32 addr, u8 *peer_id, u16 capabilities, u32 hash_mixes, bool preliminary) { struct tipc_net *tn = net_generic(net, tipc_net_id); struct tipc_link *l, *snd_l = tipc_bc_sndlink(net); struct tipc_node *n, *temp_node; unsigned long intv; int bearer_id; int i; spin_lock_bh(&tn->node_list_lock); n = tipc_node_find(net, addr) ?: tipc_node_find_by_id(net, peer_id); if (n) { if (!n->preliminary) goto update; if (preliminary) goto exit; /* A preliminary node becomes "real" now, refresh its data */ tipc_node_write_lock(n); if (!tipc_link_bc_create(net, tipc_own_addr(net), addr, peer_id, U16_MAX, tipc_link_min_win(snd_l), tipc_link_max_win(snd_l), n->capabilities, &n->bc_entry.inputq1, &n->bc_entry.namedq, snd_l, &n->bc_entry.link)) { pr_warn("Broadcast rcv link refresh failed, no memory\n"); tipc_node_write_unlock_fast(n); tipc_node_put(n); n = NULL; goto exit; } n->preliminary = false; n->addr = addr; hlist_del_rcu(&n->hash); hlist_add_head_rcu(&n->hash, &tn->node_htable[tipc_hashfn(addr)]); list_del_rcu(&n->list); list_for_each_entry_rcu(temp_node, &tn->node_list, list) { if (n->addr < temp_node->addr) break; } list_add_tail_rcu(&n->list, &temp_node->list); tipc_node_write_unlock_fast(n); update: if (n->peer_hash_mix ^ hash_mixes) tipc_node_assign_peer_net(n, hash_mixes); if (n->capabilities == capabilities) goto exit; /* Same node may come back with new capabilities */ tipc_node_write_lock(n); n->capabilities = capabilities; for (bearer_id = 0; bearer_id < MAX_BEARERS; bearer_id++) { l = n->links[bearer_id].link; if (l) tipc_link_update_caps(l, capabilities); } tipc_node_write_unlock_fast(n); /* Calculate cluster capabilities */ tn->capabilities = TIPC_NODE_CAPABILITIES; list_for_each_entry_rcu(temp_node, &tn->node_list, list) { tn->capabilities &= temp_node->capabilities; } tipc_bcast_toggle_rcast(net, (tn->capabilities & TIPC_BCAST_RCAST)); goto exit; } n = kzalloc(sizeof(*n), GFP_ATOMIC); if (!n) { pr_warn("Node creation failed, no memory\n"); goto exit; } tipc_nodeid2string(n->peer_id_string, peer_id); #ifdef CONFIG_TIPC_CRYPTO if (unlikely(tipc_crypto_start(&n->crypto_rx, net, n))) { pr_warn("Failed to start crypto RX(%s)!\n", n->peer_id_string); kfree(n); n = NULL; goto exit; } #endif n->addr = addr; n->preliminary = preliminary; memcpy(&n->peer_id, peer_id, 16); n->net = net; n->peer_net = NULL; n->peer_hash_mix = 0; /* Assign kernel local namespace if exists */ tipc_node_assign_peer_net(n, hash_mixes); n->capabilities = capabilities; kref_init(&n->kref); rwlock_init(&n->lock); INIT_HLIST_NODE(&n->hash); INIT_LIST_HEAD(&n->list); INIT_LIST_HEAD(&n->publ_list); INIT_LIST_HEAD(&n->conn_sks); skb_queue_head_init(&n->bc_entry.namedq); skb_queue_head_init(&n->bc_entry.inputq1); __skb_queue_head_init(&n->bc_entry.arrvq); skb_queue_head_init(&n->bc_entry.inputq2); for (i = 0; i < MAX_BEARERS; i++) spin_lock_init(&n->links[i].lock); n->state = SELF_DOWN_PEER_LEAVING; n->delete_at = jiffies + msecs_to_jiffies(NODE_CLEANUP_AFTER); n->signature = INVALID_NODE_SIG; n->active_links[0] = INVALID_BEARER_ID; n->active_links[1] = INVALID_BEARER_ID; if (!preliminary && !tipc_link_bc_create(net, tipc_own_addr(net), addr, peer_id, U16_MAX, tipc_link_min_win(snd_l), tipc_link_max_win(snd_l), n->capabilities, &n->bc_entry.inputq1, &n->bc_entry.namedq, snd_l, 
&n->bc_entry.link)) { pr_warn("Broadcast rcv link creation failed, no memory\n"); tipc_node_put(n); n = NULL; goto exit; } tipc_node_get(n); timer_setup(&n->timer, tipc_node_timeout, 0); /* Start a slow timer anyway, crypto needs it */ n->keepalive_intv = 10000; intv = jiffies + msecs_to_jiffies(n->keepalive_intv); if (!mod_timer(&n->timer, intv)) tipc_node_get(n); hlist_add_head_rcu(&n->hash, &tn->node_htable[tipc_hashfn(addr)]); list_for_each_entry_rcu(temp_node, &tn->node_list, list) { if (n->addr < temp_node->addr) break; } list_add_tail_rcu(&n->list, &temp_node->list); /* Calculate cluster capabilities */ tn->capabilities = TIPC_NODE_CAPABILITIES; list_for_each_entry_rcu(temp_node, &tn->node_list, list) { tn->capabilities &= temp_node->capabilities; } tipc_bcast_toggle_rcast(net, (tn->capabilities & TIPC_BCAST_RCAST)); trace_tipc_node_create(n, true, " "); exit: spin_unlock_bh(&tn->node_list_lock); return n; } static void tipc_node_calculate_timer(struct tipc_node *n, struct tipc_link *l) { unsigned long tol = tipc_link_tolerance(l); unsigned long intv = ((tol / 4) > 500) ? 500 : tol / 4; /* Link with lowest tolerance determines timer interval */ if (intv < n->keepalive_intv) n->keepalive_intv = intv; /* Ensure link's abort limit corresponds to current tolerance */ tipc_link_set_abort_limit(l, tol / n->keepalive_intv); } static void tipc_node_delete_from_list(struct tipc_node *node) { #ifdef CONFIG_TIPC_CRYPTO tipc_crypto_key_flush(node->crypto_rx); #endif list_del_rcu(&node->list); hlist_del_rcu(&node->hash); tipc_node_put(node); } static void tipc_node_delete(struct tipc_node *node) { trace_tipc_node_delete(node, true, " "); tipc_node_delete_from_list(node); del_timer_sync(&node->timer); tipc_node_put(node); } void tipc_node_stop(struct net *net) { struct tipc_net *tn = tipc_net(net); struct tipc_node *node, *t_node; spin_lock_bh(&tn->node_list_lock); list_for_each_entry_safe(node, t_node, &tn->node_list, list) tipc_node_delete(node); spin_unlock_bh(&tn->node_list_lock); } void tipc_node_subscribe(struct net *net, struct list_head *subscr, u32 addr) { struct tipc_node *n; if (in_own_node(net, addr)) return; n = tipc_node_find(net, addr); if (!n) { pr_warn("Node subscribe rejected, unknown node 0x%x\n", addr); return; } tipc_node_write_lock(n); list_add_tail(subscr, &n->publ_list); tipc_node_write_unlock_fast(n); tipc_node_put(n); } void tipc_node_unsubscribe(struct net *net, struct list_head *subscr, u32 addr) { struct tipc_node *n; if (in_own_node(net, addr)) return; n = tipc_node_find(net, addr); if (!n) { pr_warn("Node unsubscribe rejected, unknown node 0x%x\n", addr); return; } tipc_node_write_lock(n); list_del_init(subscr); tipc_node_write_unlock_fast(n); tipc_node_put(n); } int tipc_node_add_conn(struct net *net, u32 dnode, u32 port, u32 peer_port) { struct tipc_node *node; struct tipc_sock_conn *conn; int err = 0; if (in_own_node(net, dnode)) return 0; node = tipc_node_find(net, dnode); if (!node) { pr_warn("Connecting sock to node 0x%x failed\n", dnode); return -EHOSTUNREACH; } conn = kmalloc(sizeof(*conn), GFP_ATOMIC); if (!conn) { err = -EHOSTUNREACH; goto exit; } conn->peer_node = dnode; conn->port = port; conn->peer_port = peer_port; tipc_node_write_lock(node); list_add_tail(&conn->list, &node->conn_sks); tipc_node_write_unlock(node); exit: tipc_node_put(node); return err; } void tipc_node_remove_conn(struct net *net, u32 dnode, u32 port) { struct tipc_node *node; struct tipc_sock_conn *conn, *safe; if (in_own_node(net, dnode)) return; node = tipc_node_find(net, dnode); 
if (!node) return; tipc_node_write_lock(node); list_for_each_entry_safe(conn, safe, &node->conn_sks, list) { if (port != conn->port) continue; list_del(&conn->list); kfree(conn); } tipc_node_write_unlock(node); tipc_node_put(node); } static void tipc_node_clear_links(struct tipc_node *node) { int i; for (i = 0; i < MAX_BEARERS; i++) { struct tipc_link_entry *le = &node->links[i]; if (le->link) { kfree(le->link); le->link = NULL; node->link_cnt--; } } } /* tipc_node_cleanup - delete nodes that does not * have active links for NODE_CLEANUP_AFTER time */ static bool tipc_node_cleanup(struct tipc_node *peer) { struct tipc_node *temp_node; struct tipc_net *tn = tipc_net(peer->net); bool deleted = false; /* If lock held by tipc_node_stop() the node will be deleted anyway */ if (!spin_trylock_bh(&tn->node_list_lock)) return false; tipc_node_write_lock(peer); if (!node_is_up(peer) && time_after(jiffies, peer->delete_at)) { tipc_node_clear_links(peer); tipc_node_delete_from_list(peer); deleted = true; } tipc_node_write_unlock(peer); if (!deleted) { spin_unlock_bh(&tn->node_list_lock); return deleted; } /* Calculate cluster capabilities */ tn->capabilities = TIPC_NODE_CAPABILITIES; list_for_each_entry_rcu(temp_node, &tn->node_list, list) { tn->capabilities &= temp_node->capabilities; } tipc_bcast_toggle_rcast(peer->net, (tn->capabilities & TIPC_BCAST_RCAST)); spin_unlock_bh(&tn->node_list_lock); return deleted; } /* tipc_node_timeout - handle expiration of node timer */ static void tipc_node_timeout(struct timer_list *t) { struct tipc_node *n = from_timer(n, t, timer); struct tipc_link_entry *le; struct sk_buff_head xmitq; int remains = n->link_cnt; int bearer_id; int rc = 0; trace_tipc_node_timeout(n, false, " "); if (!node_is_up(n) && tipc_node_cleanup(n)) { /*Removing the reference of Timer*/ tipc_node_put(n); return; } #ifdef CONFIG_TIPC_CRYPTO /* Take any crypto key related actions first */ tipc_crypto_timeout(n->crypto_rx); #endif __skb_queue_head_init(&xmitq); /* Initial node interval to value larger (10 seconds), then it will be * recalculated with link lowest tolerance */ tipc_node_read_lock(n); n->keepalive_intv = 10000; tipc_node_read_unlock(n); for (bearer_id = 0; remains && (bearer_id < MAX_BEARERS); bearer_id++) { tipc_node_read_lock(n); le = &n->links[bearer_id]; if (le->link) { spin_lock_bh(&le->lock); /* Link tolerance may change asynchronously: */ tipc_node_calculate_timer(n, le->link); rc = tipc_link_timeout(le->link, &xmitq); spin_unlock_bh(&le->lock); remains--; } tipc_node_read_unlock(n); tipc_bearer_xmit(n->net, bearer_id, &xmitq, &le->maddr, n); if (rc & TIPC_LINK_DOWN_EVT) tipc_node_link_down(n, bearer_id, false); } mod_timer(&n->timer, jiffies + msecs_to_jiffies(n->keepalive_intv)); } /** * __tipc_node_link_up - handle addition of link * @n: target tipc_node * @bearer_id: id of the bearer * @xmitq: queue for messages to be xmited on * Node lock must be held by caller * Link becomes active (alone or shared) or standby, depending on its priority. 
*/ static void __tipc_node_link_up(struct tipc_node *n, int bearer_id, struct sk_buff_head *xmitq) { int *slot0 = &n->active_links[0]; int *slot1 = &n->active_links[1]; struct tipc_link *ol = node_active_link(n, 0); struct tipc_link *nl = n->links[bearer_id].link; if (!nl || tipc_link_is_up(nl)) return; tipc_link_fsm_evt(nl, LINK_ESTABLISH_EVT); if (!tipc_link_is_up(nl)) return; n->working_links++; n->action_flags |= TIPC_NOTIFY_LINK_UP; n->link_id = tipc_link_id(nl); /* Leave room for tunnel header when returning 'mtu' to users: */ n->links[bearer_id].mtu = tipc_link_mss(nl); tipc_bearer_add_dest(n->net, bearer_id, n->addr); tipc_bcast_inc_bearer_dst_cnt(n->net, bearer_id); pr_debug("Established link <%s> on network plane %c\n", tipc_link_name(nl), tipc_link_plane(nl)); trace_tipc_node_link_up(n, true, " "); /* Ensure that a STATE message goes first */ tipc_link_build_state_msg(nl, xmitq); /* First link? => give it both slots */ if (!ol) { *slot0 = bearer_id; *slot1 = bearer_id; tipc_node_fsm_evt(n, SELF_ESTABL_CONTACT_EVT); n->action_flags |= TIPC_NOTIFY_NODE_UP; tipc_link_set_active(nl, true); tipc_bcast_add_peer(n->net, nl, xmitq); return; } /* Second link => redistribute slots */ if (tipc_link_prio(nl) > tipc_link_prio(ol)) { pr_debug("Old link <%s> becomes standby\n", tipc_link_name(ol)); *slot0 = bearer_id; *slot1 = bearer_id; tipc_link_set_active(nl, true); tipc_link_set_active(ol, false); } else if (tipc_link_prio(nl) == tipc_link_prio(ol)) { tipc_link_set_active(nl, true); *slot1 = bearer_id; } else { pr_debug("New link <%s> is standby\n", tipc_link_name(nl)); } /* Prepare synchronization with first link */ tipc_link_tnl_prepare(ol, nl, SYNCH_MSG, xmitq); } /** * tipc_node_link_up - handle addition of link * @n: target tipc_node * @bearer_id: id of the bearer * @xmitq: queue for messages to be xmited on * * Link becomes active (alone or shared) or standby, depending on its priority. */ static void tipc_node_link_up(struct tipc_node *n, int bearer_id, struct sk_buff_head *xmitq) { struct tipc_media_addr *maddr; tipc_node_write_lock(n); __tipc_node_link_up(n, bearer_id, xmitq); maddr = &n->links[bearer_id].maddr; tipc_bearer_xmit(n->net, bearer_id, xmitq, maddr, n); tipc_node_write_unlock(n); } /** * tipc_node_link_failover() - start failover in case "half-failover" * * This function is only called in a very special situation where link * failover can be already started on peer node but not on this node. * This can happen when e.g.:: * * 1. Both links <1A-2A>, <1B-2B> down * 2. Link endpoint 2A up, but 1A still down (e.g. due to network * disturbance, wrong session, etc.) * 3. Link <1B-2B> up * 4. Link endpoint 2A down (e.g. due to link tolerance timeout) * 5. Node 2 starts failover onto link <1B-2B> * * ==> Node 1 does never start link/node failover! 
* * @n: tipc node structure * @l: link peer endpoint failingover (- can be NULL) * @tnl: tunnel link * @xmitq: queue for messages to be xmited on tnl link later */ static void tipc_node_link_failover(struct tipc_node *n, struct tipc_link *l, struct tipc_link *tnl, struct sk_buff_head *xmitq) { /* Avoid to be "self-failover" that can never end */ if (!tipc_link_is_up(tnl)) return; /* Don't rush, failure link may be in the process of resetting */ if (l && !tipc_link_is_reset(l)) return; tipc_link_fsm_evt(tnl, LINK_SYNCH_END_EVT); tipc_node_fsm_evt(n, NODE_SYNCH_END_EVT); n->sync_point = tipc_link_rcv_nxt(tnl) + (U16_MAX / 2 - 1); tipc_link_failover_prepare(l, tnl, xmitq); if (l) tipc_link_fsm_evt(l, LINK_FAILOVER_BEGIN_EVT); tipc_node_fsm_evt(n, NODE_FAILOVER_BEGIN_EVT); } /** * __tipc_node_link_down - handle loss of link * @n: target tipc_node * @bearer_id: id of the bearer * @xmitq: queue for messages to be xmited on * @maddr: output media address of the bearer */ static void __tipc_node_link_down(struct tipc_node *n, int *bearer_id, struct sk_buff_head *xmitq, struct tipc_media_addr **maddr) { struct tipc_link_entry *le = &n->links[*bearer_id]; int *slot0 = &n->active_links[0]; int *slot1 = &n->active_links[1]; int i, highest = 0, prio; struct tipc_link *l, *_l, *tnl; l = n->links[*bearer_id].link; if (!l || tipc_link_is_reset(l)) return; n->working_links--; n->action_flags |= TIPC_NOTIFY_LINK_DOWN; n->link_id = tipc_link_id(l); tipc_bearer_remove_dest(n->net, *bearer_id, n->addr); pr_debug("Lost link <%s> on network plane %c\n", tipc_link_name(l), tipc_link_plane(l)); /* Select new active link if any available */ *slot0 = INVALID_BEARER_ID; *slot1 = INVALID_BEARER_ID; for (i = 0; i < MAX_BEARERS; i++) { _l = n->links[i].link; if (!_l || !tipc_link_is_up(_l)) continue; if (_l == l) continue; prio = tipc_link_prio(_l); if (prio < highest) continue; if (prio > highest) { highest = prio; *slot0 = i; *slot1 = i; continue; } *slot1 = i; } if (!node_is_up(n)) { if (tipc_link_peer_is_down(l)) tipc_node_fsm_evt(n, PEER_LOST_CONTACT_EVT); tipc_node_fsm_evt(n, SELF_LOST_CONTACT_EVT); trace_tipc_link_reset(l, TIPC_DUMP_ALL, "link down!"); tipc_link_fsm_evt(l, LINK_RESET_EVT); tipc_link_reset(l); tipc_link_build_reset_msg(l, xmitq); *maddr = &n->links[*bearer_id].maddr; node_lost_contact(n, &le->inputq); tipc_bcast_dec_bearer_dst_cnt(n->net, *bearer_id); return; } tipc_bcast_dec_bearer_dst_cnt(n->net, *bearer_id); /* There is still a working link => initiate failover */ *bearer_id = n->active_links[0]; tnl = n->links[*bearer_id].link; tipc_link_fsm_evt(tnl, LINK_SYNCH_END_EVT); tipc_node_fsm_evt(n, NODE_SYNCH_END_EVT); n->sync_point = tipc_link_rcv_nxt(tnl) + (U16_MAX / 2 - 1); tipc_link_tnl_prepare(l, tnl, FAILOVER_MSG, xmitq); trace_tipc_link_reset(l, TIPC_DUMP_ALL, "link down -> failover!"); tipc_link_reset(l); tipc_link_fsm_evt(l, LINK_RESET_EVT); tipc_link_fsm_evt(l, LINK_FAILOVER_BEGIN_EVT); tipc_node_fsm_evt(n, NODE_FAILOVER_BEGIN_EVT); *maddr = &n->links[*bearer_id].maddr; } static void tipc_node_link_down(struct tipc_node *n, int bearer_id, bool delete) { struct tipc_link_entry *le = &n->links[bearer_id]; struct tipc_media_addr *maddr = NULL; struct tipc_link *l = le->link; int old_bearer_id = bearer_id; struct sk_buff_head xmitq; if (!l) return; __skb_queue_head_init(&xmitq); tipc_node_write_lock(n); if (!tipc_link_is_establishing(l)) { __tipc_node_link_down(n, &bearer_id, &xmitq, &maddr); } else { /* Defuse pending tipc_node_link_up() */ tipc_link_reset(l); tipc_link_fsm_evt(l, 
LINK_RESET_EVT); } if (delete) { kfree(l); le->link = NULL; n->link_cnt--; } trace_tipc_node_link_down(n, true, "node link down or deleted!"); tipc_node_write_unlock(n); if (delete) tipc_mon_remove_peer(n->net, n->addr, old_bearer_id); if (!skb_queue_empty(&xmitq)) tipc_bearer_xmit(n->net, bearer_id, &xmitq, maddr, n); tipc_sk_rcv(n->net, &le->inputq); } static bool node_is_up(struct tipc_node *n) { return n->active_links[0] != INVALID_BEARER_ID; } bool tipc_node_is_up(struct net *net, u32 addr) { struct tipc_node *n; bool retval = false; if (in_own_node(net, addr)) return true; n = tipc_node_find(net, addr); if (!n) return false; retval = node_is_up(n); tipc_node_put(n); return retval; } static u32 tipc_node_suggest_addr(struct net *net, u32 addr) { struct tipc_node *n; addr ^= tipc_net(net)->random; while ((n = tipc_node_find(net, addr))) { tipc_node_put(n); addr++; } return addr; } /* tipc_node_try_addr(): Check if addr can be used by peer, suggest other if not * Returns suggested address if any, otherwise 0 */ u32 tipc_node_try_addr(struct net *net, u8 *id, u32 addr) { struct tipc_net *tn = tipc_net(net); struct tipc_node *n; bool preliminary; u32 sugg_addr; /* Suggest new address if some other peer is using this one */ n = tipc_node_find(net, addr); if (n) { if (!memcmp(n->peer_id, id, NODE_ID_LEN)) addr = 0; tipc_node_put(n); if (!addr) return 0; return tipc_node_suggest_addr(net, addr); } /* Suggest previously used address if peer is known */ n = tipc_node_find_by_id(net, id); if (n) { sugg_addr = n->addr; preliminary = n->preliminary; tipc_node_put(n); if (!preliminary) return sugg_addr; } /* Even this node may be in conflict */ if (tn->trial_addr == addr) return tipc_node_suggest_addr(net, addr); return 0; } void tipc_node_check_dest(struct net *net, u32 addr, u8 *peer_id, struct tipc_bearer *b, u16 capabilities, u32 signature, u32 hash_mixes, struct tipc_media_addr *maddr, bool *respond, bool *dupl_addr) { struct tipc_node *n; struct tipc_link *l; struct tipc_link_entry *le; bool addr_match = false; bool sign_match = false; bool link_up = false; bool link_is_reset = false; bool accept_addr = false; bool reset = false; char *if_name; unsigned long intv; u16 session; *dupl_addr = false; *respond = false; n = tipc_node_create(net, addr, peer_id, capabilities, hash_mixes, false); if (!n) return; tipc_node_write_lock(n); le = &n->links[b->identity]; /* Prepare to validate requesting node's signature and media address */ l = le->link; link_up = l && tipc_link_is_up(l); link_is_reset = l && tipc_link_is_reset(l); addr_match = l && !memcmp(&le->maddr, maddr, sizeof(*maddr)); sign_match = (signature == n->signature); /* These three flags give us eight permutations: */ if (sign_match && addr_match && link_up) { /* All is fine. Ignore requests. */ /* Peer node is not a container/local namespace */ if (!n->peer_hash_mix) n->peer_hash_mix = hash_mixes; } else if (sign_match && addr_match && !link_up) { /* Respond. The link will come up in due time */ *respond = true; } else if (sign_match && !addr_match && link_up) { /* Peer has changed i/f address without rebooting. * If so, the link will reset soon, and the next * discovery will be accepted. So we can ignore it. * It may also be a cloned or malicious peer having * chosen the same node address and signature as an * existing one. * Ignore requests until the link goes down, if ever. */ *dupl_addr = true; } else if (sign_match && !addr_match && !link_up) { /* Peer link has changed i/f address without rebooting. 
* It may also be a cloned or malicious peer; we can't * distinguish between the two. * The signature is correct, so we must accept. */ accept_addr = true; *respond = true; reset = true; } else if (!sign_match && addr_match && link_up) { /* Peer node rebooted. Two possibilities: * - Delayed re-discovery; this link endpoint has already * reset and re-established contact with the peer, before * receiving a discovery message from that node. * (The peer happened to receive one from this node first). * - The peer came back so fast that our side has not * discovered it yet. Probing from this side will soon * reset the link, since there can be no working link * endpoint at the peer end, and the link will re-establish. * Accept the signature, since it comes from a known peer. */ n->signature = signature; } else if (!sign_match && addr_match && !link_up) { /* The peer node has rebooted. * Accept signature, since it is a known peer. */ n->signature = signature; *respond = true; } else if (!sign_match && !addr_match && link_up) { /* Peer rebooted with new address, or a new/duplicate peer. * Ignore until the link goes down, if ever. */ *dupl_addr = true; } else if (!sign_match && !addr_match && !link_up) { /* Peer rebooted with new address, or it is a new peer. * Accept signature and address. */ n->signature = signature; accept_addr = true; *respond = true; reset = true; } if (!accept_addr) goto exit; /* Now create new link if not already existing */ if (!l) { if (n->link_cnt == 2) goto exit; if_name = strchr(b->name, ':') + 1; get_random_bytes(&session, sizeof(u16)); if (!tipc_link_create(net, if_name, b->identity, b->tolerance, b->net_plane, b->mtu, b->priority, b->min_win, b->max_win, session, tipc_own_addr(net), addr, peer_id, n->capabilities, tipc_bc_sndlink(n->net), n->bc_entry.link, &le->inputq, &n->bc_entry.namedq, &l)) { *respond = false; goto exit; } trace_tipc_link_reset(l, TIPC_DUMP_ALL, "link created!"); tipc_link_reset(l); tipc_link_fsm_evt(l, LINK_RESET_EVT); if (n->state == NODE_FAILINGOVER) tipc_link_fsm_evt(l, LINK_FAILOVER_BEGIN_EVT); link_is_reset = tipc_link_is_reset(l); le->link = l; n->link_cnt++; tipc_node_calculate_timer(n, l); if (n->link_cnt == 1) { intv = jiffies + msecs_to_jiffies(n->keepalive_intv); if (!mod_timer(&n->timer, intv)) tipc_node_get(n); } } memcpy(&le->maddr, maddr, sizeof(*maddr)); exit: tipc_node_write_unlock(n); if (reset && !link_is_reset) tipc_node_link_down(n, b->identity, false); tipc_node_put(n); } void tipc_node_delete_links(struct net *net, int bearer_id) { struct tipc_net *tn = net_generic(net, tipc_net_id); struct tipc_node *n; rcu_read_lock(); list_for_each_entry_rcu(n, &tn->node_list, list) { tipc_node_link_down(n, bearer_id, true); } rcu_read_unlock(); } static void tipc_node_reset_links(struct tipc_node *n) { int i; pr_warn("Resetting all links to %x\n", n->addr); trace_tipc_node_reset_links(n, true, " "); for (i = 0; i < MAX_BEARERS; i++) { tipc_node_link_down(n, i, false); } } /* tipc_node_fsm_evt - node finite state machine * Determines when contact is allowed with peer node */ static void tipc_node_fsm_evt(struct tipc_node *n, int evt) { int state = n->state; switch (state) { case SELF_DOWN_PEER_DOWN: switch (evt) { case SELF_ESTABL_CONTACT_EVT: state = SELF_UP_PEER_COMING; break; case PEER_ESTABL_CONTACT_EVT: state = SELF_COMING_PEER_UP; break; case SELF_LOST_CONTACT_EVT: case PEER_LOST_CONTACT_EVT: break; case NODE_SYNCH_END_EVT: case NODE_SYNCH_BEGIN_EVT: case NODE_FAILOVER_BEGIN_EVT: case NODE_FAILOVER_END_EVT: default: goto 
illegal_evt; } break; case SELF_UP_PEER_UP: switch (evt) { case SELF_LOST_CONTACT_EVT: state = SELF_DOWN_PEER_LEAVING; break; case PEER_LOST_CONTACT_EVT: state = SELF_LEAVING_PEER_DOWN; break; case NODE_SYNCH_BEGIN_EVT: state = NODE_SYNCHING; break; case NODE_FAILOVER_BEGIN_EVT: state = NODE_FAILINGOVER; break; case SELF_ESTABL_CONTACT_EVT: case PEER_ESTABL_CONTACT_EVT: case NODE_SYNCH_END_EVT: case NODE_FAILOVER_END_EVT: break; default: goto illegal_evt; } break; case SELF_DOWN_PEER_LEAVING: switch (evt) { case PEER_LOST_CONTACT_EVT: state = SELF_DOWN_PEER_DOWN; break; case SELF_ESTABL_CONTACT_EVT: case PEER_ESTABL_CONTACT_EVT: case SELF_LOST_CONTACT_EVT: break; case NODE_SYNCH_END_EVT: case NODE_SYNCH_BEGIN_EVT: case NODE_FAILOVER_BEGIN_EVT: case NODE_FAILOVER_END_EVT: default: goto illegal_evt; } break; case SELF_UP_PEER_COMING: switch (evt) { case PEER_ESTABL_CONTACT_EVT: state = SELF_UP_PEER_UP; break; case SELF_LOST_CONTACT_EVT: state = SELF_DOWN_PEER_DOWN; break; case SELF_ESTABL_CONTACT_EVT: case PEER_LOST_CONTACT_EVT: case NODE_SYNCH_END_EVT: case NODE_FAILOVER_BEGIN_EVT: break; case NODE_SYNCH_BEGIN_EVT: case NODE_FAILOVER_END_EVT: default: goto illegal_evt; } break; case SELF_COMING_PEER_UP: switch (evt) { case SELF_ESTABL_CONTACT_EVT: state = SELF_UP_PEER_UP; break; case PEER_LOST_CONTACT_EVT: state = SELF_DOWN_PEER_DOWN; break; case SELF_LOST_CONTACT_EVT: case PEER_ESTABL_CONTACT_EVT: break; case NODE_SYNCH_END_EVT: case NODE_SYNCH_BEGIN_EVT: case NODE_FAILOVER_BEGIN_EVT: case NODE_FAILOVER_END_EVT: default: goto illegal_evt; } break; case SELF_LEAVING_PEER_DOWN: switch (evt) { case SELF_LOST_CONTACT_EVT: state = SELF_DOWN_PEER_DOWN; break; case SELF_ESTABL_CONTACT_EVT: case PEER_ESTABL_CONTACT_EVT: case PEER_LOST_CONTACT_EVT: break; case NODE_SYNCH_END_EVT: case NODE_SYNCH_BEGIN_EVT: case NODE_FAILOVER_BEGIN_EVT: case NODE_FAILOVER_END_EVT: default: goto illegal_evt; } break; case NODE_FAILINGOVER: switch (evt) { case SELF_LOST_CONTACT_EVT: state = SELF_DOWN_PEER_LEAVING; break; case PEER_LOST_CONTACT_EVT: state = SELF_LEAVING_PEER_DOWN; break; case NODE_FAILOVER_END_EVT: state = SELF_UP_PEER_UP; break; case NODE_FAILOVER_BEGIN_EVT: case SELF_ESTABL_CONTACT_EVT: case PEER_ESTABL_CONTACT_EVT: break; case NODE_SYNCH_BEGIN_EVT: case NODE_SYNCH_END_EVT: default: goto illegal_evt; } break; case NODE_SYNCHING: switch (evt) { case SELF_LOST_CONTACT_EVT: state = SELF_DOWN_PEER_LEAVING; break; case PEER_LOST_CONTACT_EVT: state = SELF_LEAVING_PEER_DOWN; break; case NODE_SYNCH_END_EVT: state = SELF_UP_PEER_UP; break; case NODE_FAILOVER_BEGIN_EVT: state = NODE_FAILINGOVER; break; case NODE_SYNCH_BEGIN_EVT: case SELF_ESTABL_CONTACT_EVT: case PEER_ESTABL_CONTACT_EVT: break; case NODE_FAILOVER_END_EVT: default: goto illegal_evt; } break; default: pr_err("Unknown node fsm state %x\n", state); break; } trace_tipc_node_fsm(n->peer_id, n->state, state, evt); n->state = state; return; illegal_evt: pr_err("Illegal node fsm evt %x in state %x\n", evt, state); trace_tipc_node_fsm(n->peer_id, n->state, state, evt); } static void node_lost_contact(struct tipc_node *n, struct sk_buff_head *inputq) { struct tipc_sock_conn *conn, *safe; struct tipc_link *l; struct list_head *conns = &n->conn_sks; struct sk_buff *skb; uint i; pr_debug("Lost contact with %x\n", n->addr); n->delete_at = jiffies + msecs_to_jiffies(NODE_CLEANUP_AFTER); trace_tipc_node_lost_contact(n, true, " "); /* Clean up broadcast state */ tipc_bcast_remove_peer(n->net, n->bc_entry.link); skb_queue_purge(&n->bc_entry.namedq); /* Abort any 
ongoing link failover */ for (i = 0; i < MAX_BEARERS; i++) { l = n->links[i].link; if (l) tipc_link_fsm_evt(l, LINK_FAILOVER_END_EVT); } /* Notify publications from this node */ n->action_flags |= TIPC_NOTIFY_NODE_DOWN; n->peer_net = NULL; n->peer_hash_mix = 0; /* Notify sockets connected to node */ list_for_each_entry_safe(conn, safe, conns, list) { skb = tipc_msg_create(TIPC_CRITICAL_IMPORTANCE, TIPC_CONN_MSG, SHORT_H_SIZE, 0, tipc_own_addr(n->net), conn->peer_node, conn->port, conn->peer_port, TIPC_ERR_NO_NODE); if (likely(skb)) skb_queue_tail(inputq, skb); list_del(&conn->list); kfree(conn); } } /** * tipc_node_get_linkname - get the name of a link * * @net: the applicable net namespace * @bearer_id: id of the bearer * @addr: peer node address * @linkname: link name output buffer * @len: size of @linkname output buffer * * Return: 0 on success */ int tipc_node_get_linkname(struct net *net, u32 bearer_id, u32 addr, char *linkname, size_t len) { struct tipc_link *link; int err = -EINVAL; struct tipc_node *node = tipc_node_find(net, addr); if (!node) return err; if (bearer_id >= MAX_BEARERS) goto exit; tipc_node_read_lock(node); link = node->links[bearer_id].link; if (link) { strncpy(linkname, tipc_link_name(link), len); err = 0; } tipc_node_read_unlock(node); exit: tipc_node_put(node); return err; } /* Caller should hold node lock for the passed node */ static int __tipc_nl_add_node(struct tipc_nl_msg *msg, struct tipc_node *node) { void *hdr; struct nlattr *attrs; hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family, NLM_F_MULTI, TIPC_NL_NODE_GET); if (!hdr) return -EMSGSIZE; attrs = nla_nest_start_noflag(msg->skb, TIPC_NLA_NODE); if (!attrs) goto msg_full; if (nla_put_u32(msg->skb, TIPC_NLA_NODE_ADDR, node->addr)) goto attr_msg_full; if (node_is_up(node)) if (nla_put_flag(msg->skb, TIPC_NLA_NODE_UP)) goto attr_msg_full; nla_nest_end(msg->skb, attrs); genlmsg_end(msg->skb, hdr); return 0; attr_msg_full: nla_nest_cancel(msg->skb, attrs); msg_full: genlmsg_cancel(msg->skb, hdr); return -EMSGSIZE; } static void tipc_lxc_xmit(struct net *peer_net, struct sk_buff_head *list) { struct tipc_msg *hdr = buf_msg(skb_peek(list)); struct sk_buff_head inputq; switch (msg_user(hdr)) { case TIPC_LOW_IMPORTANCE: case TIPC_MEDIUM_IMPORTANCE: case TIPC_HIGH_IMPORTANCE: case TIPC_CRITICAL_IMPORTANCE: if (msg_connected(hdr) || msg_named(hdr) || msg_direct(hdr)) { tipc_loopback_trace(peer_net, list); spin_lock_init(&list->lock); tipc_sk_rcv(peer_net, list); return; } if (msg_mcast(hdr)) { tipc_loopback_trace(peer_net, list); skb_queue_head_init(&inputq); tipc_sk_mcast_rcv(peer_net, list, &inputq); __skb_queue_purge(list); skb_queue_purge(&inputq); return; } return; case MSG_FRAGMENTER: if (tipc_msg_assemble(list)) { tipc_loopback_trace(peer_net, list); skb_queue_head_init(&inputq); tipc_sk_mcast_rcv(peer_net, list, &inputq); __skb_queue_purge(list); skb_queue_purge(&inputq); } return; case GROUP_PROTOCOL: case CONN_MANAGER: tipc_loopback_trace(peer_net, list); spin_lock_init(&list->lock); tipc_sk_rcv(peer_net, list); return; case LINK_PROTOCOL: case NAME_DISTRIBUTOR: case TUNNEL_PROTOCOL: case BCAST_PROTOCOL: return; default: return; } } /** * tipc_node_xmit() - general link level function for message sending * @net: the applicable net namespace * @list: chain of buffers containing message * @dnode: address of destination node * @selector: a number used for deterministic link selection * Consumes the buffer chain. 
* Return: 0 if success, otherwise: -ELINKCONG,-EHOSTUNREACH,-EMSGSIZE,-ENOBUF */ int tipc_node_xmit(struct net *net, struct sk_buff_head *list, u32 dnode, int selector) { struct tipc_link_entry *le = NULL; struct tipc_node *n; struct sk_buff_head xmitq; bool node_up = false; struct net *peer_net; int bearer_id; int rc; if (in_own_node(net, dnode)) { tipc_loopback_trace(net, list); spin_lock_init(&list->lock); tipc_sk_rcv(net, list); return 0; } n = tipc_node_find(net, dnode); if (unlikely(!n)) { __skb_queue_purge(list); return -EHOSTUNREACH; } rcu_read_lock(); tipc_node_read_lock(n); node_up = node_is_up(n); peer_net = n->peer_net; tipc_node_read_unlock(n); if (node_up && peer_net && check_net(peer_net)) { /* xmit inner linux container */ tipc_lxc_xmit(peer_net, list); if (likely(skb_queue_empty(list))) { rcu_read_unlock(); tipc_node_put(n); return 0; } } rcu_read_unlock(); tipc_node_read_lock(n); bearer_id = n->active_links[selector & 1]; if (unlikely(bearer_id == INVALID_BEARER_ID)) { tipc_node_read_unlock(n); tipc_node_put(n); __skb_queue_purge(list); return -EHOSTUNREACH; } __skb_queue_head_init(&xmitq); le = &n->links[bearer_id]; spin_lock_bh(&le->lock); rc = tipc_link_xmit(le->link, list, &xmitq); spin_unlock_bh(&le->lock); tipc_node_read_unlock(n); if (unlikely(rc == -ENOBUFS)) tipc_node_link_down(n, bearer_id, false); else tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr, n); tipc_node_put(n); return rc; } /* tipc_node_xmit_skb(): send single buffer to destination * Buffers sent via this function are generally TIPC_SYSTEM_IMPORTANCE * messages, which will not be rejected * The only exception is datagram messages rerouted after secondary * lookup, which are rare and safe to dispose of anyway. */ int tipc_node_xmit_skb(struct net *net, struct sk_buff *skb, u32 dnode, u32 selector) { struct sk_buff_head head; __skb_queue_head_init(&head); __skb_queue_tail(&head, skb); tipc_node_xmit(net, &head, dnode, selector); return 0; } /* tipc_node_distr_xmit(): send single buffer msgs to individual destinations * Note: this is only for SYSTEM_IMPORTANCE messages, which cannot be rejected */ int tipc_node_distr_xmit(struct net *net, struct sk_buff_head *xmitq) { struct sk_buff *skb; u32 selector, dnode; while ((skb = __skb_dequeue(xmitq))) { selector = msg_origport(buf_msg(skb)); dnode = msg_destnode(buf_msg(skb)); tipc_node_xmit_skb(net, skb, dnode, selector); } return 0; } void tipc_node_broadcast(struct net *net, struct sk_buff *skb, int rc_dests) { struct sk_buff_head xmitq; struct sk_buff *txskb; struct tipc_node *n; u16 dummy; u32 dst; /* Use broadcast if all nodes support it */ if (!rc_dests && tipc_bcast_get_mode(net) != BCLINK_MODE_RCAST) { __skb_queue_head_init(&xmitq); __skb_queue_tail(&xmitq, skb); tipc_bcast_xmit(net, &xmitq, &dummy); return; } /* Otherwise use legacy replicast method */ rcu_read_lock(); list_for_each_entry_rcu(n, tipc_nodes(net), list) { dst = n->addr; if (in_own_node(net, dst)) continue; if (!node_is_up(n)) continue; txskb = pskb_copy(skb, GFP_ATOMIC); if (!txskb) break; msg_set_destnode(buf_msg(txskb), dst); tipc_node_xmit_skb(net, txskb, dst, 0); } rcu_read_unlock(); kfree_skb(skb); } static void tipc_node_mcast_rcv(struct tipc_node *n) { struct tipc_bclink_entry *be = &n->bc_entry; /* 'arrvq' is under inputq2's lock protection */ spin_lock_bh(&be->inputq2.lock); spin_lock_bh(&be->inputq1.lock); skb_queue_splice_tail_init(&be->inputq1, &be->arrvq); spin_unlock_bh(&be->inputq1.lock); spin_unlock_bh(&be->inputq2.lock); tipc_sk_mcast_rcv(n->net, &be->arrvq, 
&be->inputq2); } static void tipc_node_bc_sync_rcv(struct tipc_node *n, struct tipc_msg *hdr, int bearer_id, struct sk_buff_head *xmitq) { struct tipc_link *ucl; int rc; rc = tipc_bcast_sync_rcv(n->net, n->bc_entry.link, hdr, xmitq); if (rc & TIPC_LINK_DOWN_EVT) { tipc_node_reset_links(n); return; } if (!(rc & TIPC_LINK_SND_STATE)) return; /* If probe message, a STATE response will be sent anyway */ if (msg_probe(hdr)) return; /* Produce a STATE message carrying broadcast NACK */ tipc_node_read_lock(n); ucl = n->links[bearer_id].link; if (ucl) tipc_link_build_state_msg(ucl, xmitq); tipc_node_read_unlock(n); } /** * tipc_node_bc_rcv - process TIPC broadcast packet arriving from off-node * @net: the applicable net namespace * @skb: TIPC packet * @bearer_id: id of bearer message arrived on * * Invoked with no locks held. */ static void tipc_node_bc_rcv(struct net *net, struct sk_buff *skb, int bearer_id) { int rc; struct sk_buff_head xmitq; struct tipc_bclink_entry *be; struct tipc_link_entry *le; struct tipc_msg *hdr = buf_msg(skb); int usr = msg_user(hdr); u32 dnode = msg_destnode(hdr); struct tipc_node *n; __skb_queue_head_init(&xmitq); /* If NACK for other node, let rcv link for that node peek into it */ if ((usr == BCAST_PROTOCOL) && (dnode != tipc_own_addr(net))) n = tipc_node_find(net, dnode); else n = tipc_node_find(net, msg_prevnode(hdr)); if (!n) { kfree_skb(skb); return; } be = &n->bc_entry; le = &n->links[bearer_id]; rc = tipc_bcast_rcv(net, be->link, skb); /* Broadcast ACKs are sent on a unicast link */ if (rc & TIPC_LINK_SND_STATE) { tipc_node_read_lock(n); tipc_link_build_state_msg(le->link, &xmitq); tipc_node_read_unlock(n); } if (!skb_queue_empty(&xmitq)) tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr, n); if (!skb_queue_empty(&be->inputq1)) tipc_node_mcast_rcv(n); /* Handle NAME_DISTRIBUTOR messages sent from 1.7 nodes */ if (!skb_queue_empty(&n->bc_entry.namedq)) tipc_named_rcv(net, &n->bc_entry.namedq, &n->bc_entry.named_rcv_nxt, &n->bc_entry.named_open); /* If reassembly or retransmission failure => reset all links to peer */ if (rc & TIPC_LINK_DOWN_EVT) tipc_node_reset_links(n); tipc_node_put(n); } /** * tipc_node_check_state - check and if necessary update node state * @n: target tipc_node * @skb: TIPC packet * @bearer_id: identity of bearer delivering the packet * @xmitq: queue for messages to be xmited on * Return: true if state and msg are ok, otherwise false */ static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb, int bearer_id, struct sk_buff_head *xmitq) { struct tipc_msg *hdr = buf_msg(skb); int usr = msg_user(hdr); int mtyp = msg_type(hdr); u16 oseqno = msg_seqno(hdr); u16 exp_pkts = msg_msgcnt(hdr); u16 rcv_nxt, syncpt, dlv_nxt, inputq_len; int state = n->state; struct tipc_link *l, *tnl, *pl = NULL; struct tipc_media_addr *maddr; int pb_id; if (trace_tipc_node_check_state_enabled()) { trace_tipc_skb_dump(skb, false, "skb for node state check"); trace_tipc_node_check_state(n, true, " "); } l = n->links[bearer_id].link; if (!l) return false; rcv_nxt = tipc_link_rcv_nxt(l); if (likely((state == SELF_UP_PEER_UP) && (usr != TUNNEL_PROTOCOL))) return true; /* Find parallel link, if any */ for (pb_id = 0; pb_id < MAX_BEARERS; pb_id++) { if ((pb_id != bearer_id) && n->links[pb_id].link) { pl = n->links[pb_id].link; break; } } if (!tipc_link_validate_msg(l, hdr)) { trace_tipc_skb_dump(skb, false, "PROTO invalid (2)!"); trace_tipc_link_dump(l, TIPC_DUMP_NONE, "PROTO invalid (2)!"); return false; } /* Check and update node accesibility if 
applicable */ if (state == SELF_UP_PEER_COMING) { if (!tipc_link_is_up(l)) return true; if (!msg_peer_link_is_up(hdr)) return true; tipc_node_fsm_evt(n, PEER_ESTABL_CONTACT_EVT); } if (state == SELF_DOWN_PEER_LEAVING) { if (msg_peer_node_is_up(hdr)) return false; tipc_node_fsm_evt(n, PEER_LOST_CONTACT_EVT); return true; } if (state == SELF_LEAVING_PEER_DOWN) return false; /* Ignore duplicate packets */ if ((usr != LINK_PROTOCOL) && less(oseqno, rcv_nxt)) return true; /* Initiate or update failover mode if applicable */ if ((usr == TUNNEL_PROTOCOL) && (mtyp == FAILOVER_MSG)) { syncpt = oseqno + exp_pkts - 1; if (pl && !tipc_link_is_reset(pl)) { __tipc_node_link_down(n, &pb_id, xmitq, &maddr); trace_tipc_node_link_down(n, true, "node link down <- failover!"); tipc_skb_queue_splice_tail_init(tipc_link_inputq(pl), tipc_link_inputq(l)); } /* If parallel link was already down, and this happened before * the tunnel link came up, node failover was never started. * Ensure that a FAILOVER_MSG is sent to get peer out of * NODE_FAILINGOVER state, also this node must accept * TUNNEL_MSGs from peer. */ if (n->state != NODE_FAILINGOVER) tipc_node_link_failover(n, pl, l, xmitq); /* If pkts arrive out of order, use lowest calculated syncpt */ if (less(syncpt, n->sync_point)) n->sync_point = syncpt; } /* Open parallel link when tunnel link reaches synch point */ if ((n->state == NODE_FAILINGOVER) && tipc_link_is_up(l)) { if (!more(rcv_nxt, n->sync_point)) return true; tipc_node_fsm_evt(n, NODE_FAILOVER_END_EVT); if (pl) tipc_link_fsm_evt(pl, LINK_FAILOVER_END_EVT); return true; } /* No syncing needed if only one link */ if (!pl || !tipc_link_is_up(pl)) return true; /* Initiate synch mode if applicable */ if ((usr == TUNNEL_PROTOCOL) && (mtyp == SYNCH_MSG) && (oseqno == 1)) { if (n->capabilities & TIPC_TUNNEL_ENHANCED) syncpt = msg_syncpt(hdr); else syncpt = msg_seqno(msg_inner_hdr(hdr)) + exp_pkts - 1; if (!tipc_link_is_up(l)) __tipc_node_link_up(n, bearer_id, xmitq); if (n->state == SELF_UP_PEER_UP) { n->sync_point = syncpt; tipc_link_fsm_evt(l, LINK_SYNCH_BEGIN_EVT); tipc_node_fsm_evt(n, NODE_SYNCH_BEGIN_EVT); } } /* Open tunnel link when parallel link reaches synch point */ if (n->state == NODE_SYNCHING) { if (tipc_link_is_synching(l)) { tnl = l; } else { tnl = pl; pl = l; } inputq_len = skb_queue_len(tipc_link_inputq(pl)); dlv_nxt = tipc_link_rcv_nxt(pl) - inputq_len; if (more(dlv_nxt, n->sync_point)) { tipc_link_fsm_evt(tnl, LINK_SYNCH_END_EVT); tipc_node_fsm_evt(n, NODE_SYNCH_END_EVT); return true; } if (l == pl) return true; if ((usr == TUNNEL_PROTOCOL) && (mtyp == SYNCH_MSG)) return true; if (usr == LINK_PROTOCOL) return true; return false; } return true; } /** * tipc_rcv - process TIPC packets/messages arriving from off-node * @net: the applicable net namespace * @skb: TIPC packet * @b: pointer to bearer message arrived on * * Invoked with no locks held. Bearer pointer must point to a valid bearer * structure (i.e. cannot be NULL), but bearer can be inactive. 
*/ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b) { struct sk_buff_head xmitq; struct tipc_link_entry *le; struct tipc_msg *hdr; struct tipc_node *n; int bearer_id = b->identity; u32 self = tipc_own_addr(net); int usr, rc = 0; u16 bc_ack; #ifdef CONFIG_TIPC_CRYPTO struct tipc_ehdr *ehdr; /* Check if message must be decrypted first */ if (TIPC_SKB_CB(skb)->decrypted || !tipc_ehdr_validate(skb)) goto rcv; ehdr = (struct tipc_ehdr *)skb->data; if (likely(ehdr->user != LINK_CONFIG)) { n = tipc_node_find(net, ntohl(ehdr->addr)); if (unlikely(!n)) goto discard; } else { n = tipc_node_find_by_id(net, ehdr->id); } skb_dst_force(skb); tipc_crypto_rcv(net, (n) ? n->crypto_rx : NULL, &skb, b); if (!skb) return; rcv: #endif /* Ensure message is well-formed before touching the header */ if (unlikely(!tipc_msg_validate(&skb))) goto discard; __skb_queue_head_init(&xmitq); hdr = buf_msg(skb); usr = msg_user(hdr); bc_ack = msg_bcast_ack(hdr); /* Handle arrival of discovery or broadcast packet */ if (unlikely(msg_non_seq(hdr))) { if (unlikely(usr == LINK_CONFIG)) return tipc_disc_rcv(net, skb, b); else return tipc_node_bc_rcv(net, skb, bearer_id); } /* Discard unicast link messages destined for another node */ if (unlikely(!msg_short(hdr) && (msg_destnode(hdr) != self))) goto discard; /* Locate neighboring node that sent packet */ n = tipc_node_find(net, msg_prevnode(hdr)); if (unlikely(!n)) goto discard; le = &n->links[bearer_id]; /* Ensure broadcast reception is in synch with peer's send state */ if (unlikely(usr == LINK_PROTOCOL)) { if (unlikely(skb_linearize(skb))) { tipc_node_put(n); goto discard; } hdr = buf_msg(skb); tipc_node_bc_sync_rcv(n, hdr, bearer_id, &xmitq); } else if (unlikely(tipc_link_acked(n->bc_entry.link) != bc_ack)) { tipc_bcast_ack_rcv(net, n->bc_entry.link, hdr); } /* Receive packet directly if conditions permit */ tipc_node_read_lock(n); if (likely((n->state == SELF_UP_PEER_UP) && (usr != TUNNEL_PROTOCOL))) { spin_lock_bh(&le->lock); if (le->link) { rc = tipc_link_rcv(le->link, skb, &xmitq); skb = NULL; } spin_unlock_bh(&le->lock); } tipc_node_read_unlock(n); /* Check/update node state before receiving */ if (unlikely(skb)) { if (unlikely(skb_linearize(skb))) goto out_node_put; tipc_node_write_lock(n); if (tipc_node_check_state(n, skb, bearer_id, &xmitq)) { if (le->link) { rc = tipc_link_rcv(le->link, skb, &xmitq); skb = NULL; } } tipc_node_write_unlock(n); } if (unlikely(rc & TIPC_LINK_UP_EVT)) tipc_node_link_up(n, bearer_id, &xmitq); if (unlikely(rc & TIPC_LINK_DOWN_EVT)) tipc_node_link_down(n, bearer_id, false); if (unlikely(!skb_queue_empty(&n->bc_entry.namedq))) tipc_named_rcv(net, &n->bc_entry.namedq, &n->bc_entry.named_rcv_nxt, &n->bc_entry.named_open); if (unlikely(!skb_queue_empty(&n->bc_entry.inputq1))) tipc_node_mcast_rcv(n); if (!skb_queue_empty(&le->inputq)) tipc_sk_rcv(net, &le->inputq); if (!skb_queue_empty(&xmitq)) tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr, n); out_node_put: tipc_node_put(n); discard: kfree_skb(skb); } void tipc_node_apply_property(struct net *net, struct tipc_bearer *b, int prop) { struct tipc_net *tn = tipc_net(net); int bearer_id = b->identity; struct sk_buff_head xmitq; struct tipc_link_entry *e; struct tipc_node *n; __skb_queue_head_init(&xmitq); rcu_read_lock(); list_for_each_entry_rcu(n, &tn->node_list, list) { tipc_node_write_lock(n); e = &n->links[bearer_id]; if (e->link) { if (prop == TIPC_NLA_PROP_TOL) tipc_link_set_tolerance(e->link, b->tolerance, &xmitq); else if (prop == TIPC_NLA_PROP_MTU) 
tipc_link_set_mtu(e->link, b->mtu); /* Update MTU for node link entry */ e->mtu = tipc_link_mss(e->link); } tipc_node_write_unlock(n); tipc_bearer_xmit(net, bearer_id, &xmitq, &e->maddr, NULL); } rcu_read_unlock(); } int tipc_nl_peer_rm(struct sk_buff *skb, struct genl_info *info) { struct net *net = sock_net(skb->sk); struct tipc_net *tn = net_generic(net, tipc_net_id); struct nlattr *attrs[TIPC_NLA_NET_MAX + 1]; struct tipc_node *peer, *temp_node; u8 node_id[NODE_ID_LEN]; u64 *w0 = (u64 *)&node_id[0]; u64 *w1 = (u64 *)&node_id[8]; u32 addr; int err; /* We identify the peer by its net */ if (!info->attrs[TIPC_NLA_NET]) return -EINVAL; err = nla_parse_nested_deprecated(attrs, TIPC_NLA_NET_MAX, info->attrs[TIPC_NLA_NET], tipc_nl_net_policy, info->extack); if (err) return err; /* attrs[TIPC_NLA_NET_NODEID] and attrs[TIPC_NLA_NET_ADDR] are * mutually exclusive cases */ if (attrs[TIPC_NLA_NET_ADDR]) { addr = nla_get_u32(attrs[TIPC_NLA_NET_ADDR]); if (!addr) return -EINVAL; } if (attrs[TIPC_NLA_NET_NODEID]) { if (!attrs[TIPC_NLA_NET_NODEID_W1]) return -EINVAL; *w0 = nla_get_u64(attrs[TIPC_NLA_NET_NODEID]); *w1 = nla_get_u64(attrs[TIPC_NLA_NET_NODEID_W1]); addr = hash128to32(node_id); } if (in_own_node(net, addr)) return -ENOTSUPP; spin_lock_bh(&tn->node_list_lock); peer = tipc_node_find(net, addr); if (!peer) { spin_unlock_bh(&tn->node_list_lock); return -ENXIO; } tipc_node_write_lock(peer); if (peer->state != SELF_DOWN_PEER_DOWN && peer->state != SELF_DOWN_PEER_LEAVING) { tipc_node_write_unlock(peer); err = -EBUSY; goto err_out; } tipc_node_clear_links(peer); tipc_node_write_unlock(peer); tipc_node_delete(peer); /* Calculate cluster capabilities */ tn->capabilities = TIPC_NODE_CAPABILITIES; list_for_each_entry_rcu(temp_node, &tn->node_list, list) { tn->capabilities &= temp_node->capabilities; } tipc_bcast_toggle_rcast(net, (tn->capabilities & TIPC_BCAST_RCAST)); err = 0; err_out: tipc_node_put(peer); spin_unlock_bh(&tn->node_list_lock); return err; } int tipc_nl_node_dump(struct sk_buff *skb, struct netlink_callback *cb) { int err; struct net *net = sock_net(skb->sk); struct tipc_net *tn = net_generic(net, tipc_net_id); int done = cb->args[0]; int last_addr = cb->args[1]; struct tipc_node *node; struct tipc_nl_msg msg; if (done) return 0; msg.skb = skb; msg.portid = NETLINK_CB(cb->skb).portid; msg.seq = cb->nlh->nlmsg_seq; rcu_read_lock(); if (last_addr) { node = tipc_node_find(net, last_addr); if (!node) { rcu_read_unlock(); /* We never set seq or call nl_dump_check_consistent() * this means that setting prev_seq here will cause the * consistence check to fail in the netlink callback * handler. Resulting in the NLMSG_DONE message having * the NLM_F_DUMP_INTR flag set if the node state * changed while we released the lock. */ cb->prev_seq = 1; return -EPIPE; } tipc_node_put(node); } list_for_each_entry_rcu(node, &tn->node_list, list) { if (node->preliminary) continue; if (last_addr) { if (node->addr == last_addr) last_addr = 0; else continue; } tipc_node_read_lock(node); err = __tipc_nl_add_node(&msg, node); if (err) { last_addr = node->addr; tipc_node_read_unlock(node); goto out; } tipc_node_read_unlock(node); } done = 1; out: cb->args[0] = done; cb->args[1] = last_addr; rcu_read_unlock(); return skb->len; } /* tipc_node_find_by_name - locate owner node of link by link's name * @net: the applicable net namespace * @name: pointer to link name string * @bearer_id: pointer to index in 'node->links' array where the link was found. 
* * Returns pointer to node owning the link, or 0 if no matching link is found. */ static struct tipc_node *tipc_node_find_by_name(struct net *net, const char *link_name, unsigned int *bearer_id) { struct tipc_net *tn = net_generic(net, tipc_net_id); struct tipc_link *l; struct tipc_node *n; struct tipc_node *found_node = NULL; int i; *bearer_id = 0; rcu_read_lock(); list_for_each_entry_rcu(n, &tn->node_list, list) { tipc_node_read_lock(n); for (i = 0; i < MAX_BEARERS; i++) { l = n->links[i].link; if (l && !strcmp(tipc_link_name(l), link_name)) { *bearer_id = i; found_node = n; break; } } tipc_node_read_unlock(n); if (found_node) break; } rcu_read_unlock(); return found_node; } int tipc_nl_node_set_link(struct sk_buff *skb, struct genl_info *info) { int err; int res = 0; int bearer_id; char *name; struct tipc_link *link; struct tipc_node *node; struct sk_buff_head xmitq; struct nlattr *attrs[TIPC_NLA_LINK_MAX + 1]; struct net *net = sock_net(skb->sk); __skb_queue_head_init(&xmitq); if (!info->attrs[TIPC_NLA_LINK]) return -EINVAL; err = nla_parse_nested_deprecated(attrs, TIPC_NLA_LINK_MAX, info->attrs[TIPC_NLA_LINK], tipc_nl_link_policy, info->extack); if (err) return err; if (!attrs[TIPC_NLA_LINK_NAME]) return -EINVAL; name = nla_data(attrs[TIPC_NLA_LINK_NAME]); if (strcmp(name, tipc_bclink_name) == 0) return tipc_nl_bc_link_set(net, attrs); node = tipc_node_find_by_name(net, name, &bearer_id); if (!node) return -EINVAL; tipc_node_read_lock(node); link = node->links[bearer_id].link; if (!link) { res = -EINVAL; goto out; } if (attrs[TIPC_NLA_LINK_PROP]) { struct nlattr *props[TIPC_NLA_PROP_MAX + 1]; err = tipc_nl_parse_link_prop(attrs[TIPC_NLA_LINK_PROP], props); if (err) { res = err; goto out; } if (props[TIPC_NLA_PROP_TOL]) { u32 tol; tol = nla_get_u32(props[TIPC_NLA_PROP_TOL]); tipc_link_set_tolerance(link, tol, &xmitq); } if (props[TIPC_NLA_PROP_PRIO]) { u32 prio; prio = nla_get_u32(props[TIPC_NLA_PROP_PRIO]); tipc_link_set_prio(link, prio, &xmitq); } if (props[TIPC_NLA_PROP_WIN]) { u32 max_win; max_win = nla_get_u32(props[TIPC_NLA_PROP_WIN]); tipc_link_set_queue_limits(link, tipc_link_min_win(link), max_win); } } out: tipc_node_read_unlock(node); tipc_bearer_xmit(net, bearer_id, &xmitq, &node->links[bearer_id].maddr, NULL); return res; } int tipc_nl_node_get_link(struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); struct nlattr *attrs[TIPC_NLA_LINK_MAX + 1]; struct tipc_nl_msg msg; char *name; int err; msg.portid = info->snd_portid; msg.seq = info->snd_seq; if (!info->attrs[TIPC_NLA_LINK]) return -EINVAL; err = nla_parse_nested_deprecated(attrs, TIPC_NLA_LINK_MAX, info->attrs[TIPC_NLA_LINK], tipc_nl_link_policy, info->extack); if (err) return err; if (!attrs[TIPC_NLA_LINK_NAME]) return -EINVAL; name = nla_data(attrs[TIPC_NLA_LINK_NAME]); msg.skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); if (!msg.skb) return -ENOMEM; if (strcmp(name, tipc_bclink_name) == 0) { err = tipc_nl_add_bc_link(net, &msg, tipc_net(net)->bcl); if (err) goto err_free; } else { int bearer_id; struct tipc_node *node; struct tipc_link *link; node = tipc_node_find_by_name(net, name, &bearer_id); if (!node) { err = -EINVAL; goto err_free; } tipc_node_read_lock(node); link = node->links[bearer_id].link; if (!link) { tipc_node_read_unlock(node); err = -EINVAL; goto err_free; } err = __tipc_nl_add_link(net, &msg, link, 0); tipc_node_read_unlock(node); if (err) goto err_free; } return genlmsg_reply(msg.skb, info); err_free: nlmsg_free(msg.skb); return err; } int 
tipc_nl_node_reset_link_stats(struct sk_buff *skb, struct genl_info *info) { int err; char *link_name; unsigned int bearer_id; struct tipc_link *link; struct tipc_node *node; struct nlattr *attrs[TIPC_NLA_LINK_MAX + 1]; struct net *net = sock_net(skb->sk); struct tipc_net *tn = tipc_net(net); struct tipc_link_entry *le; if (!info->attrs[TIPC_NLA_LINK]) return -EINVAL; err = nla_parse_nested_deprecated(attrs, TIPC_NLA_LINK_MAX, info->attrs[TIPC_NLA_LINK], tipc_nl_link_policy, info->extack); if (err) return err; if (!attrs[TIPC_NLA_LINK_NAME]) return -EINVAL; link_name = nla_data(attrs[TIPC_NLA_LINK_NAME]); err = -EINVAL; if (!strcmp(link_name, tipc_bclink_name)) { err = tipc_bclink_reset_stats(net, tipc_bc_sndlink(net)); if (err) return err; return 0; } else if (strstr(link_name, tipc_bclink_name)) { rcu_read_lock(); list_for_each_entry_rcu(node, &tn->node_list, list) { tipc_node_read_lock(node); link = node->bc_entry.link; if (link && !strcmp(link_name, tipc_link_name(link))) { err = tipc_bclink_reset_stats(net, link); tipc_node_read_unlock(node); break; } tipc_node_read_unlock(node); } rcu_read_unlock(); return err; } node = tipc_node_find_by_name(net, link_name, &bearer_id); if (!node) return -EINVAL; le = &node->links[bearer_id]; tipc_node_read_lock(node); spin_lock_bh(&le->lock); link = node->links[bearer_id].link; if (!link) { spin_unlock_bh(&le->lock); tipc_node_read_unlock(node); return -EINVAL; } tipc_link_reset_stats(link); spin_unlock_bh(&le->lock); tipc_node_read_unlock(node); return 0; } /* Caller should hold node lock */ static int __tipc_nl_add_node_links(struct net *net, struct tipc_nl_msg *msg, struct tipc_node *node, u32 *prev_link, bool bc_link) { u32 i; int err; for (i = *prev_link; i < MAX_BEARERS; i++) { *prev_link = i; if (!node->links[i].link) continue; err = __tipc_nl_add_link(net, msg, node->links[i].link, NLM_F_MULTI); if (err) return err; } if (bc_link) { *prev_link = i; err = tipc_nl_add_bc_link(net, msg, node->bc_entry.link); if (err) return err; } *prev_link = 0; return 0; } int tipc_nl_node_dump_link(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); struct nlattr **attrs = genl_dumpit_info(cb)->attrs; struct nlattr *link[TIPC_NLA_LINK_MAX + 1]; struct tipc_net *tn = net_generic(net, tipc_net_id); struct tipc_node *node; struct tipc_nl_msg msg; u32 prev_node = cb->args[0]; u32 prev_link = cb->args[1]; int done = cb->args[2]; bool bc_link = cb->args[3]; int err; if (done) return 0; if (!prev_node) { /* Check if broadcast-receiver links dumping is needed */ if (attrs && attrs[TIPC_NLA_LINK]) { err = nla_parse_nested_deprecated(link, TIPC_NLA_LINK_MAX, attrs[TIPC_NLA_LINK], tipc_nl_link_policy, NULL); if (unlikely(err)) return err; if (unlikely(!link[TIPC_NLA_LINK_BROADCAST])) return -EINVAL; bc_link = true; } } msg.skb = skb; msg.portid = NETLINK_CB(cb->skb).portid; msg.seq = cb->nlh->nlmsg_seq; rcu_read_lock(); if (prev_node) { node = tipc_node_find(net, prev_node); if (!node) { /* We never set seq or call nl_dump_check_consistent() * this means that setting prev_seq here will cause the * consistence check to fail in the netlink callback * handler. Resulting in the last NLMSG_DONE message * having the NLM_F_DUMP_INTR flag set. 
*/ cb->prev_seq = 1; goto out; } tipc_node_put(node); list_for_each_entry_continue_rcu(node, &tn->node_list, list) { tipc_node_read_lock(node); err = __tipc_nl_add_node_links(net, &msg, node, &prev_link, bc_link); tipc_node_read_unlock(node); if (err) goto out; prev_node = node->addr; } } else { err = tipc_nl_add_bc_link(net, &msg, tn->bcl); if (err) goto out; list_for_each_entry_rcu(node, &tn->node_list, list) { tipc_node_read_lock(node); err = __tipc_nl_add_node_links(net, &msg, node, &prev_link, bc_link); tipc_node_read_unlock(node); if (err) goto out; prev_node = node->addr; } } done = 1; out: rcu_read_unlock(); cb->args[0] = prev_node; cb->args[1] = prev_link; cb->args[2] = done; cb->args[3] = bc_link; return skb->len; } int tipc_nl_node_set_monitor(struct sk_buff *skb, struct genl_info *info) { struct nlattr *attrs[TIPC_NLA_MON_MAX + 1]; struct net *net = sock_net(skb->sk); int err; if (!info->attrs[TIPC_NLA_MON]) return -EINVAL; err = nla_parse_nested_deprecated(attrs, TIPC_NLA_MON_MAX, info->attrs[TIPC_NLA_MON], tipc_nl_monitor_policy, info->extack); if (err) return err; if (attrs[TIPC_NLA_MON_ACTIVATION_THRESHOLD]) { u32 val; val = nla_get_u32(attrs[TIPC_NLA_MON_ACTIVATION_THRESHOLD]); err = tipc_nl_monitor_set_threshold(net, val); if (err) return err; } return 0; } static int __tipc_nl_add_monitor_prop(struct net *net, struct tipc_nl_msg *msg) { struct nlattr *attrs; void *hdr; u32 val; hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family, 0, TIPC_NL_MON_GET); if (!hdr) return -EMSGSIZE; attrs = nla_nest_start_noflag(msg->skb, TIPC_NLA_MON); if (!attrs) goto msg_full; val = tipc_nl_monitor_get_threshold(net); if (nla_put_u32(msg->skb, TIPC_NLA_MON_ACTIVATION_THRESHOLD, val)) goto attr_msg_full; nla_nest_end(msg->skb, attrs); genlmsg_end(msg->skb, hdr); return 0; attr_msg_full: nla_nest_cancel(msg->skb, attrs); msg_full: genlmsg_cancel(msg->skb, hdr); return -EMSGSIZE; } int tipc_nl_node_get_monitor(struct sk_buff *skb, struct genl_info *info) { struct net *net = sock_net(skb->sk); struct tipc_nl_msg msg; int err; msg.skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); if (!msg.skb) return -ENOMEM; msg.portid = info->snd_portid; msg.seq = info->snd_seq; err = __tipc_nl_add_monitor_prop(net, &msg); if (err) { nlmsg_free(msg.skb); return err; } return genlmsg_reply(msg.skb, info); } int tipc_nl_node_dump_monitor(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); u32 prev_bearer = cb->args[0]; struct tipc_nl_msg msg; int bearer_id; int err; if (prev_bearer == MAX_BEARERS) return 0; msg.skb = skb; msg.portid = NETLINK_CB(cb->skb).portid; msg.seq = cb->nlh->nlmsg_seq; rtnl_lock(); for (bearer_id = prev_bearer; bearer_id < MAX_BEARERS; bearer_id++) { err = __tipc_nl_add_monitor(net, &msg, bearer_id); if (err) break; } rtnl_unlock(); cb->args[0] = bearer_id; return skb->len; } int tipc_nl_node_dump_monitor_peer(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); u32 prev_node = cb->args[1]; u32 bearer_id = cb->args[2]; int done = cb->args[0]; struct tipc_nl_msg msg; int err; if (!prev_node) { struct nlattr **attrs = genl_dumpit_info(cb)->attrs; struct nlattr *mon[TIPC_NLA_MON_MAX + 1]; if (!attrs[TIPC_NLA_MON]) return -EINVAL; err = nla_parse_nested_deprecated(mon, TIPC_NLA_MON_MAX, attrs[TIPC_NLA_MON], tipc_nl_monitor_policy, NULL); if (err) return err; if (!mon[TIPC_NLA_MON_REF]) return -EINVAL; bearer_id = nla_get_u32(mon[TIPC_NLA_MON_REF]); if (bearer_id >= MAX_BEARERS) return -EINVAL; } if (done) 
return 0; msg.skb = skb; msg.portid = NETLINK_CB(cb->skb).portid; msg.seq = cb->nlh->nlmsg_seq; rtnl_lock(); err = tipc_nl_add_monitor_peer(net, &msg, bearer_id, &prev_node); if (!err) done = 1; rtnl_unlock(); cb->args[0] = done; cb->args[1] = prev_node; cb->args[2] = bearer_id; return skb->len; } #ifdef CONFIG_TIPC_CRYPTO static int tipc_nl_retrieve_key(struct nlattr **attrs, struct tipc_aead_key **pkey) { struct nlattr *attr = attrs[TIPC_NLA_NODE_KEY]; struct tipc_aead_key *key; if (!attr) return -ENODATA; if (nla_len(attr) < sizeof(*key)) return -EINVAL; key = (struct tipc_aead_key *)nla_data(attr); if (key->keylen > TIPC_AEAD_KEYLEN_MAX || nla_len(attr) < tipc_aead_key_size(key)) return -EINVAL; *pkey = key; return 0; } static int tipc_nl_retrieve_nodeid(struct nlattr **attrs, u8 **node_id) { struct nlattr *attr = attrs[TIPC_NLA_NODE_ID]; if (!attr) return -ENODATA; if (nla_len(attr) < TIPC_NODEID_LEN) return -EINVAL; *node_id = (u8 *)nla_data(attr); return 0; } static int tipc_nl_retrieve_rekeying(struct nlattr **attrs, u32 *intv) { struct nlattr *attr = attrs[TIPC_NLA_NODE_REKEYING]; if (!attr) return -ENODATA; *intv = nla_get_u32(attr); return 0; } static int __tipc_nl_node_set_key(struct sk_buff *skb, struct genl_info *info) { struct nlattr *attrs[TIPC_NLA_NODE_MAX + 1]; struct net *net = sock_net(skb->sk); struct tipc_crypto *tx = tipc_net(net)->crypto_tx, *c = tx; struct tipc_node *n = NULL; struct tipc_aead_key *ukey; bool rekeying = true, master_key = false; u8 *id, *own_id, mode; u32 intv = 0; int rc = 0; if (!info->attrs[TIPC_NLA_NODE]) return -EINVAL; rc = nla_parse_nested(attrs, TIPC_NLA_NODE_MAX, info->attrs[TIPC_NLA_NODE], tipc_nl_node_policy, info->extack); if (rc) return rc; own_id = tipc_own_id(net); if (!own_id) { GENL_SET_ERR_MSG(info, "not found own node identity (set id?)"); return -EPERM; } rc = tipc_nl_retrieve_rekeying(attrs, &intv); if (rc == -ENODATA) rekeying = false; rc = tipc_nl_retrieve_key(attrs, &ukey); if (rc == -ENODATA && rekeying) goto rekeying; else if (rc) return rc; rc = tipc_aead_key_validate(ukey, info); if (rc) return rc; rc = tipc_nl_retrieve_nodeid(attrs, &id); switch (rc) { case -ENODATA: mode = CLUSTER_KEY; master_key = !!(attrs[TIPC_NLA_NODE_KEY_MASTER]); break; case 0: mode = PER_NODE_KEY; if (memcmp(id, own_id, NODE_ID_LEN)) { n = tipc_node_find_by_id(net, id) ?: tipc_node_create(net, 0, id, 0xffffu, 0, true); if (unlikely(!n)) return -ENOMEM; c = n->crypto_rx; } break; default: return rc; } /* Initiate the TX/RX key */ rc = tipc_crypto_key_init(c, ukey, mode, master_key); if (n) tipc_node_put(n); if (unlikely(rc < 0)) { GENL_SET_ERR_MSG(info, "unable to initiate or attach new key"); return rc; } else if (c == tx) { /* Distribute TX key but not master one */ if (!master_key && tipc_crypto_key_distr(tx, rc, NULL)) GENL_SET_ERR_MSG(info, "failed to replicate new key"); rekeying: /* Schedule TX rekeying if needed */ tipc_crypto_rekeying_sched(tx, rekeying, intv); } return 0; } int tipc_nl_node_set_key(struct sk_buff *skb, struct genl_info *info) { int err; rtnl_lock(); err = __tipc_nl_node_set_key(skb, info); rtnl_unlock(); return err; } static int __tipc_nl_node_flush_key(struct sk_buff *skb, struct genl_info *info) { struct net *net = sock_net(skb->sk); struct tipc_net *tn = tipc_net(net); struct tipc_node *n; tipc_crypto_key_flush(tn->crypto_tx); rcu_read_lock(); list_for_each_entry_rcu(n, &tn->node_list, list) tipc_crypto_key_flush(n->crypto_rx); rcu_read_unlock(); return 0; } int tipc_nl_node_flush_key(struct sk_buff *skb, struct 
genl_info *info) { int err; rtnl_lock(); err = __tipc_nl_node_flush_key(skb, info); rtnl_unlock(); return err; } #endif /** * tipc_node_dump - dump TIPC node data * @n: tipc node to be dumped * @more: dump more? * - false: dump only tipc node data * - true: dump node link data as well * @buf: returned buffer of dump data in format */ int tipc_node_dump(struct tipc_node *n, bool more, char *buf) { int i = 0; size_t sz = (more) ? NODE_LMAX : NODE_LMIN; if (!n) { i += scnprintf(buf, sz, "node data: (null)\n"); return i; } i += scnprintf(buf, sz, "node data: %x", n->addr); i += scnprintf(buf + i, sz - i, " %x", n->state); i += scnprintf(buf + i, sz - i, " %d", n->active_links[0]); i += scnprintf(buf + i, sz - i, " %d", n->active_links[1]); i += scnprintf(buf + i, sz - i, " %x", n->action_flags); i += scnprintf(buf + i, sz - i, " %u", n->failover_sent); i += scnprintf(buf + i, sz - i, " %u", n->sync_point); i += scnprintf(buf + i, sz - i, " %d", n->link_cnt); i += scnprintf(buf + i, sz - i, " %u", n->working_links); i += scnprintf(buf + i, sz - i, " %x", n->capabilities); i += scnprintf(buf + i, sz - i, " %lu\n", n->keepalive_intv); if (!more) return i; i += scnprintf(buf + i, sz - i, "link_entry[0]:\n"); i += scnprintf(buf + i, sz - i, " mtu: %u\n", n->links[0].mtu); i += scnprintf(buf + i, sz - i, " media: "); i += tipc_media_addr_printf(buf + i, sz - i, &n->links[0].maddr); i += scnprintf(buf + i, sz - i, "\n"); i += tipc_link_dump(n->links[0].link, TIPC_DUMP_NONE, buf + i); i += scnprintf(buf + i, sz - i, " inputq: "); i += tipc_list_dump(&n->links[0].inputq, false, buf + i); i += scnprintf(buf + i, sz - i, "link_entry[1]:\n"); i += scnprintf(buf + i, sz - i, " mtu: %u\n", n->links[1].mtu); i += scnprintf(buf + i, sz - i, " media: "); i += tipc_media_addr_printf(buf + i, sz - i, &n->links[1].maddr); i += scnprintf(buf + i, sz - i, "\n"); i += tipc_link_dump(n->links[1].link, TIPC_DUMP_NONE, buf + i); i += scnprintf(buf + i, sz - i, " inputq: "); i += tipc_list_dump(&n->links[1].inputq, false, buf + i); i += scnprintf(buf + i, sz - i, "bclink:\n "); i += tipc_link_dump(n->bc_entry.link, TIPC_DUMP_NONE, buf + i); return i; } void tipc_node_pre_cleanup_net(struct net *exit_net) { struct tipc_node *n; struct tipc_net *tn; struct net *tmp; rcu_read_lock(); for_each_net_rcu(tmp) { if (tmp == exit_net) continue; tn = tipc_net(tmp); if (!tn) continue; spin_lock_bh(&tn->node_list_lock); list_for_each_entry_rcu(n, &tn->node_list, list) { if (!n->peer_net) continue; if (n->peer_net != exit_net) continue; tipc_node_write_lock(n); n->peer_net = NULL; n->peer_hash_mix = 0; tipc_node_write_unlock_fast(n); break; } spin_unlock_bh(&tn->node_list_lock); } rcu_read_unlock(); }
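The two-slot active_links[] bookkeeping above is the heart of TIPC's link load sharing: a lone link owns both slots, a higher-priority link takes both slots for itself, and an equal-priority link shares by taking slot 1 only. The stand-alone user-space sketch below mirrors the re-selection loop in __tipc_node_link_down(); the toy_link type, the select_active() helper and the MAX_BEARERS value are illustrative assumptions, not kernel API.

/* Stand-alone sketch (not kernel code) of the two-slot active-link
 * selection performed after a link loss: scan the remaining usable
 * links, give both slots to the single highest-priority link, and
 * share slot 1 when priorities tie.
 */
#include <stdbool.h>
#include <stdio.h>

#define MAX_BEARERS        3
#define INVALID_BEARER_ID -1

struct toy_link {
	bool up;
	int prio;
};

static void select_active(const struct toy_link *links, int *slot0, int *slot1)
{
	int i, highest = 0;

	*slot0 = INVALID_BEARER_ID;
	*slot1 = INVALID_BEARER_ID;

	for (i = 0; i < MAX_BEARERS; i++) {
		if (!links[i].up)
			continue;
		if (links[i].prio < highest)
			continue;
		if (links[i].prio > highest) {
			highest = links[i].prio;
			*slot0 = i;
			*slot1 = i;
			continue;
		}
		*slot1 = i;	/* equal priority: load-share on slot 1 */
	}
}

int main(void)
{
	struct toy_link links[MAX_BEARERS] = {
		{ .up = true,  .prio = 10 },
		{ .up = true,  .prio = 10 },
		{ .up = false, .prio = 20 },
	};
	int slot0, slot1;

	select_active(links, &slot0, &slot1);
	printf("slot0=%d slot1=%d\n", slot0, slot1);	/* slot0=0 slot1=1 */
	return 0;
}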
/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (c) 2007-2014 Nicira, Inc. */ #ifndef DATAPATH_H #define DATAPATH_H 1 #include <asm/page.h> #include <linux/kernel.h> #include <linux/mutex.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/u64_stats_sync.h> #include <net/ip_tunnels.h> #include "conntrack.h" #include "flow.h" #include "flow_table.h" #include "meter.h" #include "vport-internal_dev.h" #define DP_MAX_PORTS USHRT_MAX #define DP_VPORT_HASH_BUCKETS 1024 #define DP_MASKS_REBALANCE_INTERVAL 4000 /** * struct dp_stats_percpu - per-cpu packet processing statistics for a given * datapath. * @n_hit: Number of received packets for which a matching flow was found in * the flow table. * @n_missed: Number of received packets that had no matching flow in the flow * table. The sum of @n_hit and @n_missed is the number of packets that have * been received by the datapath. * @n_lost: Number of received packets that had no matching flow in the flow * table that could not be sent to userspace (normally due to an overflow in * one of the datapath's queues). * @n_mask_hit: Number of masks looked up for flow match. * @n_mask_hit / (@n_hit + @n_missed) will be the average masks looked * up per packet. * @n_cache_hit: The number of received packets that had their mask found using * the mask cache. */ struct dp_stats_percpu { u64 n_hit; u64 n_missed; u64 n_lost; u64 n_mask_hit; u64 n_cache_hit; struct u64_stats_sync syncp; }; /** * struct dp_nlsk_pids - array of netlink portids for a datapath. * This is used when OVS_DP_F_DISPATCH_UPCALL_PER_CPU * is enabled and must be protected by rcu. * @rcu: RCU callback head for deferred destruction. * @n_pids: Size of @pids array. * @pids: Array storing the Netlink socket PIDs indexed by CPU ID for packets * that miss the flow table. */ struct dp_nlsk_pids { struct rcu_head rcu; u32 n_pids; u32 pids[]; }; /** * struct datapath - datapath for flow-based packet switching * @rcu: RCU callback head for deferred destruction. * @list_node: Element in global 'dps' list. * @table: flow table. * @ports: Hash table for ports. %OVSP_LOCAL port always exists. Protected by * ovs_mutex and RCU. * @stats_percpu: Per-CPU datapath statistics. * @net: Reference to net namespace. * @max_headroom: the maximum headroom of all vports in this datapath; it will * be used by all the internal vports in this dp. * @upcall_portids: RCU protected 'struct dp_nlsk_pids'. 
* * Context: See the comment on locking at the top of datapath.c for additional * locking information. */ struct datapath { struct rcu_head rcu; struct list_head list_node; /* Flow table. */ struct flow_table table; /* Switch ports. */ struct hlist_head *ports; /* Stats. */ struct dp_stats_percpu __percpu *stats_percpu; /* Network namespace ref. */ possible_net_t net; u32 user_features; u32 max_headroom; /* Switch meters. */ struct dp_meter_table meter_tbl; struct dp_nlsk_pids __rcu *upcall_portids; }; /** * struct ovs_skb_cb - OVS data in skb CB * @input_vport: The original vport packet came in on. This value is cached * when a packet is received by OVS. * @mru: The maximum received fragement size; 0 if the packet is not * fragmented. * @acts_origlen: The netlink size of the flow actions applied to this skb. * @cutlen: The number of bytes from the packet end to be removed. */ struct ovs_skb_cb { struct vport *input_vport; u16 mru; u16 acts_origlen; u32 cutlen; }; #define OVS_CB(skb) ((struct ovs_skb_cb *)(skb)->cb) /** * struct dp_upcall - metadata to include with a packet to send to userspace * @cmd: One of %OVS_PACKET_CMD_*. * @userdata: If nonnull, its variable-length value is passed to userspace as * %OVS_PACKET_ATTR_USERDATA. * @portid: Netlink portid to which packet should be sent. If @portid is 0 * then no packet is sent and the packet is accounted in the datapath's @n_lost * counter. * @egress_tun_info: If nonnull, becomes %OVS_PACKET_ATTR_EGRESS_TUN_KEY. * @mru: If not zero, Maximum received IP fragment size. */ struct dp_upcall_info { struct ip_tunnel_info *egress_tun_info; const struct nlattr *userdata; const struct nlattr *actions; int actions_len; u32 portid; u8 cmd; u16 mru; }; /** * struct ovs_net - Per net-namespace data for ovs. * @dps: List of datapaths to enable dumping them all out. * Protected by genl_mutex. */ struct ovs_net { struct list_head dps; struct work_struct dp_notify_work; struct delayed_work masks_rebalance; #if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT) struct ovs_ct_limit_info *ct_limit_info; #endif /* Module reference for configuring conntrack. */ bool xt_label; }; /** * enum ovs_pkt_hash_types - hash info to include with a packet * to send to userspace. * @OVS_PACKET_HASH_SW_BIT: indicates hash was computed in software stack. * @OVS_PACKET_HASH_L4_BIT: indicates hash is a canonical 4-tuple hash * over transport ports. 
*/ enum ovs_pkt_hash_types { OVS_PACKET_HASH_SW_BIT = (1ULL << 32), OVS_PACKET_HASH_L4_BIT = (1ULL << 33), }; extern unsigned int ovs_net_id; void ovs_lock(void); void ovs_unlock(void); #ifdef CONFIG_LOCKDEP int lockdep_ovsl_is_held(void); #else #define lockdep_ovsl_is_held() 1 #endif #define ASSERT_OVSL() WARN_ON(!lockdep_ovsl_is_held()) #define ovsl_dereference(p) \ rcu_dereference_protected(p, lockdep_ovsl_is_held()) #define rcu_dereference_ovsl(p) \ rcu_dereference_check(p, lockdep_ovsl_is_held()) static inline struct net *ovs_dp_get_net(const struct datapath *dp) { return read_pnet(&dp->net); } static inline void ovs_dp_set_net(struct datapath *dp, struct net *net) { write_pnet(&dp->net, net); } struct vport *ovs_lookup_vport(const struct datapath *dp, u16 port_no); static inline struct vport *ovs_vport_rcu(const struct datapath *dp, int port_no) { WARN_ON_ONCE(!rcu_read_lock_held()); return ovs_lookup_vport(dp, port_no); } static inline struct vport *ovs_vport_ovsl_rcu(const struct datapath *dp, int port_no) { WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_ovsl_is_held()); return ovs_lookup_vport(dp, port_no); } static inline struct vport *ovs_vport_ovsl(const struct datapath *dp, int port_no) { ASSERT_OVSL(); return ovs_lookup_vport(dp, port_no); } /* Must be called with rcu_read_lock. */ static inline struct datapath *get_dp_rcu(struct net *net, int dp_ifindex) { struct net_device *dev = dev_get_by_index_rcu(net, dp_ifindex); if (dev) { struct vport *vport = ovs_internal_dev_get_vport(dev); if (vport) return vport->dp; } return NULL; } /* The caller must hold either ovs_mutex or rcu_read_lock to keep the * returned dp pointer valid. */ static inline struct datapath *get_dp(struct net *net, int dp_ifindex) { struct datapath *dp; WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_ovsl_is_held()); rcu_read_lock(); dp = get_dp_rcu(net, dp_ifindex); rcu_read_unlock(); return dp; } extern struct notifier_block ovs_dp_device_notifier; extern struct genl_family dp_vport_genl_family; DECLARE_STATIC_KEY_FALSE(tc_recirc_sharing_support); void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key); void ovs_dp_detach_port(struct vport *); int ovs_dp_upcall(struct datapath *, struct sk_buff *, const struct sw_flow_key *, const struct dp_upcall_info *, uint32_t cutlen); u32 ovs_dp_get_upcall_portid(const struct datapath *dp, uint32_t cpu_id); const char *ovs_dp_name(const struct datapath *dp); struct sk_buff *ovs_vport_cmd_build_info(struct vport *vport, struct net *net, u32 portid, u32 seq, u8 cmd); int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb, const struct sw_flow_actions *, struct sw_flow_key *); void ovs_dp_notify_wq(struct work_struct *work); int action_fifos_init(void); void action_fifos_exit(void); /* 'KEY' must not have any bits set outside of the 'MASK' */ #define OVS_MASKED(OLD, KEY, MASK) ((KEY) | ((OLD) & ~(MASK))) #define OVS_SET_MASKED(OLD, KEY, MASK) ((OLD) = OVS_MASKED(OLD, KEY, MASK)) #define OVS_NLERR(logging_allowed, fmt, ...) \ do { \ if (logging_allowed && net_ratelimit()) \ pr_info("netlink: " fmt "\n", ##__VA_ARGS__); \ } while (0) #endif /* datapath.h */
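The OVS_MASKED()/OVS_SET_MASKED() macros defined at the end of the header implement masked rewrites: bits covered by MASK are taken from KEY, all other bits are kept from OLD, which is why KEY must not set bits outside MASK. A minimal user-space sketch of the effect; the ToS example value is chosen purely for illustration.

/* Demonstrate OVS_MASKED()/OVS_SET_MASKED(): rewrite only the ECN bits
 * (mask 0x03) of a ToS byte while preserving the DSCP bits.
 */
#include <stdint.h>
#include <stdio.h>

#define OVS_MASKED(OLD, KEY, MASK) ((KEY) | ((OLD) & ~(MASK)))
#define OVS_SET_MASKED(OLD, KEY, MASK) ((OLD) = OVS_MASKED(OLD, KEY, MASK))

int main(void)
{
	uint8_t tos = 0x2e;			/* DSCP bits set, ECN = 2 */

	OVS_SET_MASKED(tos, 0x01, 0x03);	/* set ECN = 1, keep DSCP */
	printf("tos = 0x%02x\n", tos);		/* prints tos = 0x2d */
	return 0;
}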
// SPDX-License-Identifier: GPL-2.0-only /* * File: datagram.c * * Datagram (ISI) Phonet sockets * * Copyright (C) 2008 Nokia Corporation. * * Authors: Sakari Ailus <sakari.ailus@nokia.com> * Rémi Denis-Courmont */ #include <linux/kernel.h> #include <linux/slab.h> #include <linux/socket.h> #include <asm/ioctls.h> #include <net/sock.h> #include <linux/phonet.h> #include <linux/export.h> #include <net/phonet/phonet.h> static int pn_backlog_rcv(struct sock *sk, struct sk_buff *skb); /* associated socket ceases to exist */ static void pn_sock_close(struct sock *sk, long timeout) { sk_common_release(sk); } static int pn_ioctl(struct sock *sk, int cmd, unsigned long arg) { struct sk_buff *skb; int answ; switch (cmd) { case SIOCINQ: lock_sock(sk); skb = skb_peek(&sk->sk_receive_queue); answ = skb ? skb->len : 0; release_sock(sk); return put_user(answ, (int __user *)arg); case SIOCPNADDRESOURCE: case SIOCPNDELRESOURCE: { u32 res; if (get_user(res, (u32 __user *)arg)) return -EFAULT; if (res >= 256) return -EINVAL; if (cmd == SIOCPNADDRESOURCE) return pn_sock_bind_res(sk, res); else return pn_sock_unbind_res(sk, res); } } return -ENOIOCTLCMD; } /* Destroy socket. All references are gone. */ static void pn_destruct(struct sock *sk) { skb_queue_purge(&sk->sk_receive_queue); } static int pn_init(struct sock *sk) { sk->sk_destruct = pn_destruct; return 0; } static int pn_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { DECLARE_SOCKADDR(struct sockaddr_pn *, target, msg->msg_name); struct sk_buff *skb; int err; if (msg->msg_flags & ~(MSG_DONTWAIT|MSG_EOR|MSG_NOSIGNAL| MSG_CMSG_COMPAT)) return -EOPNOTSUPP; if (target == NULL) return -EDESTADDRREQ; if (msg->msg_namelen < sizeof(struct sockaddr_pn)) return -EINVAL; if (target->spn_family != AF_PHONET) return -EAFNOSUPPORT; skb = sock_alloc_send_skb(sk, MAX_PHONET_HEADER + len, msg->msg_flags & MSG_DONTWAIT, &err); if (skb == NULL) return err; skb_reserve(skb, MAX_PHONET_HEADER); err = memcpy_from_msg((void *)skb_put(skb, len), msg, len); if (err < 0) { kfree_skb(skb); return err; } /* * Fill in the Phonet header and * finally pass the packet forwards. */ err = pn_skb_send(sk, skb, target); /* If ok, return len. */ return (err >= 0) ? 
len : err; } static int pn_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock, int flags, int *addr_len) { struct sk_buff *skb = NULL; struct sockaddr_pn sa; int rval = -EOPNOTSUPP; int copylen; if (flags & ~(MSG_PEEK|MSG_TRUNC|MSG_DONTWAIT|MSG_NOSIGNAL| MSG_CMSG_COMPAT)) goto out_nofree; skb = skb_recv_datagram(sk, flags, noblock, &rval); if (skb == NULL) goto out_nofree; pn_skb_get_src_sockaddr(skb, &sa); copylen = skb->len; if (len < copylen) { msg->msg_flags |= MSG_TRUNC; copylen = len; } rval = skb_copy_datagram_msg(skb, 0, msg, copylen); if (rval) { rval = -EFAULT; goto out; } rval = (flags & MSG_TRUNC) ? skb->len : copylen; if (msg->msg_name != NULL) { __sockaddr_check_size(sizeof(sa)); memcpy(msg->msg_name, &sa, sizeof(sa)); *addr_len = sizeof(sa); } out: skb_free_datagram(sk, skb); out_nofree: return rval; } /* Queue an skb for a sock. */ static int pn_backlog_rcv(struct sock *sk, struct sk_buff *skb) { int err = sock_queue_rcv_skb(sk, skb); if (err < 0) kfree_skb(skb); return err ? NET_RX_DROP : NET_RX_SUCCESS; } /* Module registration */ static struct proto pn_proto = { .close = pn_sock_close, .ioctl = pn_ioctl, .init = pn_init, .sendmsg = pn_sendmsg, .recvmsg = pn_recvmsg, .backlog_rcv = pn_backlog_rcv, .hash = pn_sock_hash, .unhash = pn_sock_unhash, .get_port = pn_sock_get_port, .obj_size = sizeof(struct pn_sock), .owner = THIS_MODULE, .name = "PHONET", }; static const struct phonet_protocol pn_dgram_proto = { .ops = &phonet_dgram_ops, .prot = &pn_proto, .sock_type = SOCK_DGRAM, }; int __init isi_register(void) { return phonet_proto_register(PN_PROTO_PHONET, &pn_dgram_proto); } void __exit isi_unregister(void) { phonet_proto_unregister(PN_PROTO_PHONET, &pn_dgram_proto); }
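For context, this is roughly how the datagram API implemented above is exercised from user space: sendto() ends up in pn_sendmsg(), which insists on an AF_PHONET destination address, and SIOCINQ is answered by pn_ioctl() with the length of the first queued datagram. This is a hedged sketch only: the resource number is a placeholder, and on most systems the calls will fail unless a Phonet-capable device and the required privileges are present.

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <sys/ioctl.h>
#include <linux/phonet.h>
#include <linux/sockios.h>

int main(void)
{
        struct sockaddr_pn dst;
        unsigned char payload[1] = { 0 };       /* placeholder ISI payload */
        int fd, queued = 0;

        fd = socket(AF_PHONET, SOCK_DGRAM, PN_PROTO_PHONET);
        if (fd < 0) {
                perror("socket(AF_PHONET)");
                return 1;
        }

        memset(&dst, 0, sizeof(dst));
        dst.spn_family = AF_PHONET;
        dst.spn_resource = 0x10;                /* placeholder resource id */

        /* Handled by pn_sendmsg(): a missing or non-AF_PHONET address
         * would yield -EDESTADDRREQ / -EAFNOSUPPORT. */
        if (sendto(fd, payload, sizeof(payload), 0,
                   (struct sockaddr *)&dst, sizeof(dst)) < 0)
                perror("sendto");

        /* Handled by pn_ioctl(): length of the first queued datagram. */
        if (ioctl(fd, SIOCINQ, &queued) == 0)
                printf("next datagram: %d bytes\n", queued);

        close(fd);
        return 0;
}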
// SPDX-License-Identifier: GPL-2.0 /* Copyright (C) B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich */ #include "send.h" #include "main.h" #include <linux/atomic.h> #include <linux/bug.h> #include <linux/byteorder/generic.h> #include <linux/errno.h> #include <linux/etherdevice.h> #include <linux/gfp.h> #include <linux/if.h> #include <linux/if_ether.h> #include <linux/jiffies.h> #include <linux/kernel.h> #include <linux/kref.h> #include <linux/list.h> #include <linux/netdevice.h> #include <linux/printk.h> #include <linux/rculist.h> #include <linux/rcupdate.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/stddef.h> #include <linux/workqueue.h> #include "distributed-arp-table.h" #include "fragmentation.h" #include "gateway_client.h" #include "hard-interface.h" #include "log.h" #include "network-coding.h" #include "originator.h" #include "routing.h" #include "soft-interface.h" #include "translation-table.h" static void batadv_send_outstanding_bcast_packet(struct work_struct *work); /** * batadv_send_skb_packet() - send an already prepared packet * @skb: the packet to send * @hard_iface: the interface to use to send the broadcast packet * @dst_addr: the payload destination * * Send out an already prepared packet to the given neighbor or broadcast it * using the specified interface. Either hard_iface or neigh_node must be not * NULL. * If neigh_node is NULL, then the packet is broadcasted using hard_iface, * otherwise it is sent as unicast to the given neighbor. * * Regardless of the return value, the skb is consumed. * * Return: A negative errno code is returned on a failure. A success does not * guarantee the frame will be transmitted as it may be dropped due * to congestion or traffic shaping. */ int batadv_send_skb_packet(struct sk_buff *skb, struct batadv_hard_iface *hard_iface, const u8 *dst_addr) { struct batadv_priv *bat_priv; struct ethhdr *ethhdr; int ret; bat_priv = netdev_priv(hard_iface->soft_iface); if (hard_iface->if_status != BATADV_IF_ACTIVE) goto send_skb_err; if (unlikely(!hard_iface->net_dev)) goto send_skb_err; if (!(hard_iface->net_dev->flags & IFF_UP)) { pr_warn("Interface %s is not up - can't send packet via that interface!\n", hard_iface->net_dev->name); goto send_skb_err; } /* push to the ethernet header.
*/ if (batadv_skb_head_push(skb, ETH_HLEN) < 0) goto send_skb_err; skb_reset_mac_header(skb); ethhdr = eth_hdr(skb); ether_addr_copy(ethhdr->h_source, hard_iface->net_dev->dev_addr); ether_addr_copy(ethhdr->h_dest, dst_addr); ethhdr->h_proto = htons(ETH_P_BATMAN); skb_set_network_header(skb, ETH_HLEN); skb->protocol = htons(ETH_P_BATMAN); skb->dev = hard_iface->net_dev; /* Save a clone of the skb to use when decoding coded packets */ batadv_nc_skb_store_for_decoding(bat_priv, skb); /* dev_queue_xmit() returns a negative result on error. However on * congestion and traffic shaping, it drops and returns NET_XMIT_DROP * (which is > 0). This will not be treated as an error. */ ret = dev_queue_xmit(skb); return net_xmit_eval(ret); send_skb_err: kfree_skb(skb); return NET_XMIT_DROP; } /** * batadv_send_broadcast_skb() - Send broadcast packet via hard interface * @skb: packet to be transmitted (with batadv header and no outer eth header) * @hard_iface: outgoing interface * * Return: A negative errno code is returned on a failure. A success does not * guarantee the frame will be transmitted as it may be dropped due * to congestion or traffic shaping. */ int batadv_send_broadcast_skb(struct sk_buff *skb, struct batadv_hard_iface *hard_iface) { return batadv_send_skb_packet(skb, hard_iface, batadv_broadcast_addr); } /** * batadv_send_unicast_skb() - Send unicast packet to neighbor * @skb: packet to be transmitted (with batadv header and no outer eth header) * @neigh: neighbor which is used as next hop to destination * * Return: A negative errno code is returned on a failure. A success does not * guarantee the frame will be transmitted as it may be dropped due * to congestion or traffic shaping. */ int batadv_send_unicast_skb(struct sk_buff *skb, struct batadv_neigh_node *neigh) { #ifdef CONFIG_BATMAN_ADV_BATMAN_V struct batadv_hardif_neigh_node *hardif_neigh; #endif int ret; ret = batadv_send_skb_packet(skb, neigh->if_incoming, neigh->addr); #ifdef CONFIG_BATMAN_ADV_BATMAN_V hardif_neigh = batadv_hardif_neigh_get(neigh->if_incoming, neigh->addr); if (hardif_neigh && ret != NET_XMIT_DROP) hardif_neigh->bat_v.last_unicast_tx = jiffies; batadv_hardif_neigh_put(hardif_neigh); #endif return ret; } /** * batadv_send_skb_to_orig() - Lookup next-hop and transmit skb. * @skb: Packet to be transmitted. * @orig_node: Final destination of the packet. * @recv_if: Interface used when receiving the packet (can be NULL). * * Looks up the best next-hop towards the passed originator and passes the * skb on for preparation of MAC header. If the packet originated from this * host, NULL can be passed as recv_if and no interface alternating is * attempted. * * Return: negative errno code on a failure, -EINPROGRESS if the skb is * buffered for later transmit or the NET_XMIT status returned by the * lower routine if the packet has been passed down. */ int batadv_send_skb_to_orig(struct sk_buff *skb, struct batadv_orig_node *orig_node, struct batadv_hard_iface *recv_if) { struct batadv_priv *bat_priv = orig_node->bat_priv; struct batadv_neigh_node *neigh_node; int ret; /* batadv_find_router() increases neigh_nodes refcount if found. */ neigh_node = batadv_find_router(bat_priv, orig_node, recv_if); if (!neigh_node) { ret = -EINVAL; goto free_skb; } /* Check if the skb is too large to send in one piece and fragment * it if needed. */ if (atomic_read(&bat_priv->fragmentation) && skb->len > neigh_node->if_incoming->net_dev->mtu) { /* Fragment and send packet. 
*/ ret = batadv_frag_send_packet(skb, orig_node, neigh_node); /* skb was consumed */ skb = NULL; goto put_neigh_node; } /* try to network code the packet, if it is received on an interface * (i.e. being forwarded). If the packet originates from this node or if * network coding fails, then send the packet as usual. */ if (recv_if && batadv_nc_skb_forward(skb, neigh_node)) ret = -EINPROGRESS; else ret = batadv_send_unicast_skb(skb, neigh_node); /* skb was consumed */ skb = NULL; put_neigh_node: batadv_neigh_node_put(neigh_node); free_skb: kfree_skb(skb); return ret; } /** * batadv_send_skb_push_fill_unicast() - extend the buffer and initialize the * common fields for unicast packets * @skb: the skb carrying the unicast header to initialize * @hdr_size: amount of bytes to push at the beginning of the skb * @orig_node: the destination node * * Return: false if the buffer extension was not possible or true otherwise. */ static bool batadv_send_skb_push_fill_unicast(struct sk_buff *skb, int hdr_size, struct batadv_orig_node *orig_node) { struct batadv_unicast_packet *unicast_packet; u8 ttvn = (u8)atomic_read(&orig_node->last_ttvn); if (batadv_skb_head_push(skb, hdr_size) < 0) return false; unicast_packet = (struct batadv_unicast_packet *)skb->data; unicast_packet->version = BATADV_COMPAT_VERSION; /* batman packet type: unicast */ unicast_packet->packet_type = BATADV_UNICAST; /* set unicast ttl */ unicast_packet->ttl = BATADV_TTL; /* copy the destination for faster routing */ ether_addr_copy(unicast_packet->dest, orig_node->orig); /* set the destination tt version number */ unicast_packet->ttvn = ttvn; return true; } /** * batadv_send_skb_prepare_unicast() - encapsulate an skb with a unicast header * @skb: the skb containing the payload to encapsulate * @orig_node: the destination node * * Return: false if the payload could not be encapsulated or true otherwise. */ static bool batadv_send_skb_prepare_unicast(struct sk_buff *skb, struct batadv_orig_node *orig_node) { size_t uni_size = sizeof(struct batadv_unicast_packet); return batadv_send_skb_push_fill_unicast(skb, uni_size, orig_node); } /** * batadv_send_skb_prepare_unicast_4addr() - encapsulate an skb with a * unicast 4addr header * @bat_priv: the bat priv with all the soft interface information * @skb: the skb containing the payload to encapsulate * @orig: the destination node * @packet_subtype: the unicast 4addr packet subtype to use * * Return: false if the payload could not be encapsulated or true otherwise. */ bool batadv_send_skb_prepare_unicast_4addr(struct batadv_priv *bat_priv, struct sk_buff *skb, struct batadv_orig_node *orig, int packet_subtype) { struct batadv_hard_iface *primary_if; struct batadv_unicast_4addr_packet *uc_4addr_packet; bool ret = false; primary_if = batadv_primary_if_get_selected(bat_priv); if (!primary_if) goto out; /* Pull the header space and fill the unicast_packet substructure. 
* We can do that because the first member of the uc_4addr_packet * is of type struct unicast_packet */ if (!batadv_send_skb_push_fill_unicast(skb, sizeof(*uc_4addr_packet), orig)) goto out; uc_4addr_packet = (struct batadv_unicast_4addr_packet *)skb->data; uc_4addr_packet->u.packet_type = BATADV_UNICAST_4ADDR; ether_addr_copy(uc_4addr_packet->src, primary_if->net_dev->dev_addr); uc_4addr_packet->subtype = packet_subtype; uc_4addr_packet->reserved = 0; ret = true; out: batadv_hardif_put(primary_if); return ret; } /** * batadv_send_skb_unicast() - encapsulate and send an skb via unicast * @bat_priv: the bat priv with all the soft interface information * @skb: payload to send * @packet_type: the batman unicast packet type to use * @packet_subtype: the unicast 4addr packet subtype (only relevant for unicast * 4addr packets) * @orig_node: the originator to send the packet to * @vid: the vid to be used to search the translation table * * Wrap the given skb into a batman-adv unicast or unicast-4addr header * depending on whether BATADV_UNICAST or BATADV_UNICAST_4ADDR was supplied * as packet_type. Then send this frame to the given orig_node. * * Return: NET_XMIT_DROP in case of error or NET_XMIT_SUCCESS otherwise. */ int batadv_send_skb_unicast(struct batadv_priv *bat_priv, struct sk_buff *skb, int packet_type, int packet_subtype, struct batadv_orig_node *orig_node, unsigned short vid) { struct batadv_unicast_packet *unicast_packet; struct ethhdr *ethhdr; int ret = NET_XMIT_DROP; if (!orig_node) goto out; switch (packet_type) { case BATADV_UNICAST: if (!batadv_send_skb_prepare_unicast(skb, orig_node)) goto out; break; case BATADV_UNICAST_4ADDR: if (!batadv_send_skb_prepare_unicast_4addr(bat_priv, skb, orig_node, packet_subtype)) goto out; break; default: /* this function supports UNICAST and UNICAST_4ADDR only. It * should never be invoked with any other packet type */ goto out; } /* skb->data might have been reallocated by * batadv_send_skb_prepare_unicast{,_4addr}() */ ethhdr = eth_hdr(skb); unicast_packet = (struct batadv_unicast_packet *)skb->data; /* inform the destination node that we are still missing a correct route * for this client. The destination will receive this packet and will * try to reroute it because the ttvn contained in the header is less * than the current one */ if (batadv_tt_global_client_is_roaming(bat_priv, ethhdr->h_dest, vid)) unicast_packet->ttvn = unicast_packet->ttvn - 1; ret = batadv_send_skb_to_orig(skb, orig_node, NULL); /* skb was consumed */ skb = NULL; out: kfree_skb(skb); return ret; } /** * batadv_send_skb_via_tt_generic() - send an skb via TT lookup * @bat_priv: the bat priv with all the soft interface information * @skb: payload to send * @packet_type: the batman unicast packet type to use * @packet_subtype: the unicast 4addr packet subtype (only relevant for unicast * 4addr packets) * @dst_hint: can be used to override the destination contained in the skb * @vid: the vid to be used to search the translation table * * Look up the recipient node for the destination address in the ethernet * header via the translation table. Wrap the given skb into a batman-adv * unicast or unicast-4addr header depending on whether BATADV_UNICAST or * BATADV_UNICAST_4ADDR was supplied as packet_type. Then send this frame * to the according destination node. * * Return: NET_XMIT_DROP in case of error or NET_XMIT_SUCCESS otherwise. 
*/ int batadv_send_skb_via_tt_generic(struct batadv_priv *bat_priv, struct sk_buff *skb, int packet_type, int packet_subtype, u8 *dst_hint, unsigned short vid) { struct ethhdr *ethhdr = (struct ethhdr *)skb->data; struct batadv_orig_node *orig_node; u8 *src, *dst; int ret; src = ethhdr->h_source; dst = ethhdr->h_dest; /* if we got an hint! let's send the packet to this client (if any) */ if (dst_hint) { src = NULL; dst = dst_hint; } orig_node = batadv_transtable_search(bat_priv, src, dst, vid); ret = batadv_send_skb_unicast(bat_priv, skb, packet_type, packet_subtype, orig_node, vid); batadv_orig_node_put(orig_node); return ret; } /** * batadv_send_skb_via_gw() - send an skb via gateway lookup * @bat_priv: the bat priv with all the soft interface information * @skb: payload to send * @vid: the vid to be used to search the translation table * * Look up the currently selected gateway. Wrap the given skb into a batman-adv * unicast header and send this frame to this gateway node. * * Return: NET_XMIT_DROP in case of error or NET_XMIT_SUCCESS otherwise. */ int batadv_send_skb_via_gw(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid) { struct batadv_orig_node *orig_node; int ret; orig_node = batadv_gw_get_selected_orig(bat_priv); ret = batadv_send_skb_unicast(bat_priv, skb, BATADV_UNICAST_4ADDR, BATADV_P_DATA, orig_node, vid); batadv_orig_node_put(orig_node); return ret; } /** * batadv_forw_packet_free() - free a forwarding packet * @forw_packet: The packet to free * @dropped: whether the packet is freed because is dropped * * This frees a forwarding packet and releases any resources it might * have claimed. */ void batadv_forw_packet_free(struct batadv_forw_packet *forw_packet, bool dropped) { if (dropped) kfree_skb(forw_packet->skb); else consume_skb(forw_packet->skb); batadv_hardif_put(forw_packet->if_incoming); batadv_hardif_put(forw_packet->if_outgoing); if (forw_packet->queue_left) atomic_inc(forw_packet->queue_left); kfree(forw_packet); } /** * batadv_forw_packet_alloc() - allocate a forwarding packet * @if_incoming: The (optional) if_incoming to be grabbed * @if_outgoing: The (optional) if_outgoing to be grabbed * @queue_left: The (optional) queue counter to decrease * @bat_priv: The bat_priv for the mesh of this forw_packet * @skb: The raw packet this forwarding packet shall contain * * Allocates a forwarding packet and tries to get a reference to the * (optional) if_incoming, if_outgoing and queue_left. If queue_left * is NULL then bat_priv is optional, too. * * Return: An allocated forwarding packet on success, NULL otherwise. 
*/ struct batadv_forw_packet * batadv_forw_packet_alloc(struct batadv_hard_iface *if_incoming, struct batadv_hard_iface *if_outgoing, atomic_t *queue_left, struct batadv_priv *bat_priv, struct sk_buff *skb) { struct batadv_forw_packet *forw_packet; const char *qname; if (queue_left && !batadv_atomic_dec_not_zero(queue_left)) { qname = "unknown"; if (queue_left == &bat_priv->bcast_queue_left) qname = "bcast"; if (queue_left == &bat_priv->batman_queue_left) qname = "batman"; batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "%s queue is full\n", qname); return NULL; } forw_packet = kmalloc(sizeof(*forw_packet), GFP_ATOMIC); if (!forw_packet) goto err; if (if_incoming) kref_get(&if_incoming->refcount); if (if_outgoing) kref_get(&if_outgoing->refcount); INIT_HLIST_NODE(&forw_packet->list); INIT_HLIST_NODE(&forw_packet->cleanup_list); forw_packet->skb = skb; forw_packet->queue_left = queue_left; forw_packet->if_incoming = if_incoming; forw_packet->if_outgoing = if_outgoing; forw_packet->num_packets = 0; return forw_packet; err: if (queue_left) atomic_inc(queue_left); return NULL; } /** * batadv_forw_packet_was_stolen() - check whether someone stole this packet * @forw_packet: the forwarding packet to check * * This function checks whether the given forwarding packet was claimed by * someone else for free(). * * Return: True if someone stole it, false otherwise. */ static bool batadv_forw_packet_was_stolen(struct batadv_forw_packet *forw_packet) { return !hlist_unhashed(&forw_packet->cleanup_list); } /** * batadv_forw_packet_steal() - claim a forw_packet for free() * @forw_packet: the forwarding packet to steal * @lock: a key to the store to steal from (e.g. forw_{bat,bcast}_list_lock) * * This function tries to steal a specific forw_packet from global * visibility for the purpose of getting it for free(). That means * the caller is *not* allowed to requeue it afterwards. * * Return: True if stealing was successful. False if someone else stole it * before us. */ bool batadv_forw_packet_steal(struct batadv_forw_packet *forw_packet, spinlock_t *lock) { /* did purging routine steal it earlier? */ spin_lock_bh(lock); if (batadv_forw_packet_was_stolen(forw_packet)) { spin_unlock_bh(lock); return false; } hlist_del_init(&forw_packet->list); /* Just to spot misuse of this function */ hlist_add_fake(&forw_packet->cleanup_list); spin_unlock_bh(lock); return true; } /** * batadv_forw_packet_list_steal() - claim a list of forward packets for free() * @forw_list: the to be stolen forward packets * @cleanup_list: a backup pointer, to be able to dispose the packet later * @hard_iface: the interface to steal forward packets from * * This function claims responsibility to free any forw_packet queued on the * given hard_iface. If hard_iface is NULL forwarding packets on all hard * interfaces will be claimed. * * The packets are being moved from the forw_list to the cleanup_list. This * makes it possible for already running threads to notice the claim. 
*/ static void batadv_forw_packet_list_steal(struct hlist_head *forw_list, struct hlist_head *cleanup_list, const struct batadv_hard_iface *hard_iface) { struct batadv_forw_packet *forw_packet; struct hlist_node *safe_tmp_node; hlist_for_each_entry_safe(forw_packet, safe_tmp_node, forw_list, list) { /* if purge_outstanding_packets() was called with an argument * we delete only packets belonging to the given interface */ if (hard_iface && forw_packet->if_incoming != hard_iface && forw_packet->if_outgoing != hard_iface) continue; hlist_del(&forw_packet->list); hlist_add_head(&forw_packet->cleanup_list, cleanup_list); } } /** * batadv_forw_packet_list_free() - free a list of forward packets * @head: a list of to be freed forw_packets * * This function cancels the scheduling of any packet in the provided list, * waits for any possibly running packet forwarding thread to finish and * finally, safely frees this forward packet. * * This function might sleep. */ static void batadv_forw_packet_list_free(struct hlist_head *head) { struct batadv_forw_packet *forw_packet; struct hlist_node *safe_tmp_node; hlist_for_each_entry_safe(forw_packet, safe_tmp_node, head, cleanup_list) { cancel_delayed_work_sync(&forw_packet->delayed_work); hlist_del(&forw_packet->cleanup_list); batadv_forw_packet_free(forw_packet, true); } } /** * batadv_forw_packet_queue() - try to queue a forwarding packet * @forw_packet: the forwarding packet to queue * @lock: a key to the store (e.g. forw_{bat,bcast}_list_lock) * @head: the shelve to queue it on (e.g. forw_{bat,bcast}_list) * @send_time: timestamp (jiffies) when the packet is to be sent * * This function tries to (re)queue a forwarding packet. Requeuing * is prevented if the according interface is shutting down * (e.g. if batadv_forw_packet_list_steal() was called for this * packet earlier). * * Calling batadv_forw_packet_queue() after a call to * batadv_forw_packet_steal() is forbidden! * * Caller needs to ensure that forw_packet->delayed_work was initialized. */ static void batadv_forw_packet_queue(struct batadv_forw_packet *forw_packet, spinlock_t *lock, struct hlist_head *head, unsigned long send_time) { spin_lock_bh(lock); /* did purging routine steal it from us? */ if (batadv_forw_packet_was_stolen(forw_packet)) { /* If you got it for free() without trouble, then * don't get back into the queue after stealing... */ WARN_ONCE(hlist_fake(&forw_packet->cleanup_list), "Requeuing after batadv_forw_packet_steal() not allowed!\n"); spin_unlock_bh(lock); return; } hlist_del_init(&forw_packet->list); hlist_add_head(&forw_packet->list, head); queue_delayed_work(batadv_event_workqueue, &forw_packet->delayed_work, send_time - jiffies); spin_unlock_bh(lock); } /** * batadv_forw_packet_bcast_queue() - try to queue a broadcast packet * @bat_priv: the bat priv with all the soft interface information * @forw_packet: the forwarding packet to queue * @send_time: timestamp (jiffies) when the packet is to be sent * * This function tries to (re)queue a broadcast packet. * * Caller needs to ensure that forw_packet->delayed_work was initialized. 
*/ static void batadv_forw_packet_bcast_queue(struct batadv_priv *bat_priv, struct batadv_forw_packet *forw_packet, unsigned long send_time) { batadv_forw_packet_queue(forw_packet, &bat_priv->forw_bcast_list_lock, &bat_priv->forw_bcast_list, send_time); } /** * batadv_forw_packet_ogmv1_queue() - try to queue an OGMv1 packet * @bat_priv: the bat priv with all the soft interface information * @forw_packet: the forwarding packet to queue * @send_time: timestamp (jiffies) when the packet is to be sent * * This function tries to (re)queue an OGMv1 packet. * * Caller needs to ensure that forw_packet->delayed_work was initialized. */ void batadv_forw_packet_ogmv1_queue(struct batadv_priv *bat_priv, struct batadv_forw_packet *forw_packet, unsigned long send_time) { batadv_forw_packet_queue(forw_packet, &bat_priv->forw_bat_list_lock, &bat_priv->forw_bat_list, send_time); } /** * batadv_forw_bcast_packet_to_list() - queue broadcast packet for transmissions * @bat_priv: the bat priv with all the soft interface information * @skb: broadcast packet to add * @delay: number of jiffies to wait before sending * @own_packet: true if it is a self-generated broadcast packet * @if_in: the interface where the packet was received on * @if_out: the outgoing interface to queue on * * Adds a broadcast packet to the queue and sets up timers. Broadcast packets * are sent multiple times to increase probability for being received. * * This call clones the given skb, hence the caller needs to take into * account that the data segment of the original skb might not be * modifiable anymore. * * Return: NETDEV_TX_OK on success and NETDEV_TX_BUSY on errors. */ static int batadv_forw_bcast_packet_to_list(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned long delay, bool own_packet, struct batadv_hard_iface *if_in, struct batadv_hard_iface *if_out) { struct batadv_forw_packet *forw_packet; unsigned long send_time = jiffies; struct sk_buff *newskb; newskb = skb_clone(skb, GFP_ATOMIC); if (!newskb) goto err; forw_packet = batadv_forw_packet_alloc(if_in, if_out, &bat_priv->bcast_queue_left, bat_priv, newskb); if (!forw_packet) goto err_packet_free; forw_packet->own = own_packet; INIT_DELAYED_WORK(&forw_packet->delayed_work, batadv_send_outstanding_bcast_packet); send_time += delay ? delay : msecs_to_jiffies(5); batadv_forw_packet_bcast_queue(bat_priv, forw_packet, send_time); return NETDEV_TX_OK; err_packet_free: kfree_skb(newskb); err: return NETDEV_TX_BUSY; } /** * batadv_forw_bcast_packet_if() - forward and queue a broadcast packet * @bat_priv: the bat priv with all the soft interface information * @skb: broadcast packet to add * @delay: number of jiffies to wait before sending * @own_packet: true if it is a self-generated broadcast packet * @if_in: the interface where the packet was received on * @if_out: the outgoing interface to forward to * * Transmits a broadcast packet on the specified interface either immediately * or if a delay is given after that. Furthermore, queues additional * retransmissions if this interface is a wireless one. * * This call clones the given skb, hence the caller needs to take into * account that the data segment of the original skb might not be * modifiable anymore. * * Return: NETDEV_TX_OK on success and NETDEV_TX_BUSY on errors. 
*/ static int batadv_forw_bcast_packet_if(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned long delay, bool own_packet, struct batadv_hard_iface *if_in, struct batadv_hard_iface *if_out) { unsigned int num_bcasts = if_out->num_bcasts; struct sk_buff *newskb; int ret = NETDEV_TX_OK; if (!delay) { newskb = skb_clone(skb, GFP_ATOMIC); if (!newskb) return NETDEV_TX_BUSY; batadv_send_broadcast_skb(newskb, if_out); num_bcasts--; } /* delayed broadcast or rebroadcasts? */ if (num_bcasts >= 1) { BATADV_SKB_CB(skb)->num_bcasts = num_bcasts; ret = batadv_forw_bcast_packet_to_list(bat_priv, skb, delay, own_packet, if_in, if_out); } return ret; } /** * batadv_send_no_broadcast() - check whether (re)broadcast is necessary * @bat_priv: the bat priv with all the soft interface information * @skb: broadcast packet to check * @own_packet: true if it is a self-generated broadcast packet * @if_out: the outgoing interface checked and considered for (re)broadcast * * Return: False if a packet needs to be (re)broadcasted on the given interface, * true otherwise. */ static bool batadv_send_no_broadcast(struct batadv_priv *bat_priv, struct sk_buff *skb, bool own_packet, struct batadv_hard_iface *if_out) { struct batadv_hardif_neigh_node *neigh_node = NULL; struct batadv_bcast_packet *bcast_packet; u8 *orig_neigh; u8 *neigh_addr; char *type; int ret; if (!own_packet) { neigh_addr = eth_hdr(skb)->h_source; neigh_node = batadv_hardif_neigh_get(if_out, neigh_addr); } bcast_packet = (struct batadv_bcast_packet *)skb->data; orig_neigh = neigh_node ? neigh_node->orig : NULL; ret = batadv_hardif_no_broadcast(if_out, bcast_packet->orig, orig_neigh); batadv_hardif_neigh_put(neigh_node); /* ok, may broadcast */ if (!ret) return false; /* no broadcast */ switch (ret) { case BATADV_HARDIF_BCAST_NORECIPIENT: type = "no neighbor"; break; case BATADV_HARDIF_BCAST_DUPFWD: type = "single neighbor is source"; break; case BATADV_HARDIF_BCAST_DUPORIG: type = "single neighbor is originator"; break; default: type = "unknown"; } batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "BCAST packet from orig %pM on %s suppressed: %s\n", bcast_packet->orig, if_out->net_dev->name, type); return true; } /** * __batadv_forw_bcast_packet() - forward and queue a broadcast packet * @bat_priv: the bat priv with all the soft interface information * @skb: broadcast packet to add * @delay: number of jiffies to wait before sending * @own_packet: true if it is a self-generated broadcast packet * * Transmits a broadcast packet either immediately or if a delay is given * after that. Furthermore, queues additional retransmissions on wireless * interfaces. * * This call clones the given skb, hence the caller needs to take into * account that the data segment of the given skb might not be * modifiable anymore. * * Return: NETDEV_TX_OK on success and NETDEV_TX_BUSY on errors. 
*/ static int __batadv_forw_bcast_packet(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned long delay, bool own_packet) { struct batadv_hard_iface *hard_iface; struct batadv_hard_iface *primary_if; int ret = NETDEV_TX_OK; primary_if = batadv_primary_if_get_selected(bat_priv); if (!primary_if) return NETDEV_TX_BUSY; rcu_read_lock(); list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) { if (hard_iface->soft_iface != bat_priv->soft_iface) continue; if (!kref_get_unless_zero(&hard_iface->refcount)) continue; if (batadv_send_no_broadcast(bat_priv, skb, own_packet, hard_iface)) { batadv_hardif_put(hard_iface); continue; } ret = batadv_forw_bcast_packet_if(bat_priv, skb, delay, own_packet, primary_if, hard_iface); batadv_hardif_put(hard_iface); if (ret == NETDEV_TX_BUSY) break; } rcu_read_unlock(); batadv_hardif_put(primary_if); return ret; } /** * batadv_forw_bcast_packet() - forward and queue a broadcast packet * @bat_priv: the bat priv with all the soft interface information * @skb: broadcast packet to add * @delay: number of jiffies to wait before sending * @own_packet: true if it is a self-generated broadcast packet * * Transmits a broadcast packet either immediately or if a delay is given * after that. Furthermore, queues additional retransmissions on wireless * interfaces. * * Return: NETDEV_TX_OK on success and NETDEV_TX_BUSY on errors. */ int batadv_forw_bcast_packet(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned long delay, bool own_packet) { return __batadv_forw_bcast_packet(bat_priv, skb, delay, own_packet); } /** * batadv_send_bcast_packet() - send and queue a broadcast packet * @bat_priv: the bat priv with all the soft interface information * @skb: broadcast packet to add * @delay: number of jiffies to wait before sending * @own_packet: true if it is a self-generated broadcast packet * * Transmits a broadcast packet either immediately or if a delay is given * after that. Furthermore, queues additional retransmissions on wireless * interfaces. * * Consumes the provided skb. */ void batadv_send_bcast_packet(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned long delay, bool own_packet) { __batadv_forw_bcast_packet(bat_priv, skb, delay, own_packet); consume_skb(skb); } /** * batadv_forw_packet_bcasts_left() - check if a retransmission is necessary * @forw_packet: the forwarding packet to check * * Checks whether a given packet has any (re)transmissions left on the provided * interface. * * hard_iface may be NULL: In that case the number of transmissions this skb had * so far is compared with the maximum amount of retransmissions independent of * any interface instead. * * Return: True if (re)transmissions are left, false otherwise. */ static bool batadv_forw_packet_bcasts_left(struct batadv_forw_packet *forw_packet) { return BATADV_SKB_CB(forw_packet->skb)->num_bcasts; } /** * batadv_forw_packet_bcasts_dec() - decrement retransmission counter of a * packet * @forw_packet: the packet to decrease the counter for */ static void batadv_forw_packet_bcasts_dec(struct batadv_forw_packet *forw_packet) { BATADV_SKB_CB(forw_packet->skb)->num_bcasts--; } /** * batadv_forw_packet_is_rebroadcast() - check packet for previous transmissions * @forw_packet: the packet to check * * Return: True if this packet was transmitted before, false otherwise. 
*/ bool batadv_forw_packet_is_rebroadcast(struct batadv_forw_packet *forw_packet) { unsigned char num_bcasts = BATADV_SKB_CB(forw_packet->skb)->num_bcasts; return num_bcasts != forw_packet->if_outgoing->num_bcasts; } /** * batadv_send_outstanding_bcast_packet() - transmit a queued broadcast packet * @work: work queue item * * Transmits a queued broadcast packet and if necessary reschedules it. */ static void batadv_send_outstanding_bcast_packet(struct work_struct *work) { unsigned long send_time = jiffies + msecs_to_jiffies(5); struct batadv_forw_packet *forw_packet; struct delayed_work *delayed_work; struct batadv_priv *bat_priv; struct sk_buff *skb1; bool dropped = false; delayed_work = to_delayed_work(work); forw_packet = container_of(delayed_work, struct batadv_forw_packet, delayed_work); bat_priv = netdev_priv(forw_packet->if_incoming->soft_iface); if (atomic_read(&bat_priv->mesh_state) == BATADV_MESH_DEACTIVATING) { dropped = true; goto out; } if (batadv_dat_drop_broadcast_packet(bat_priv, forw_packet)) { dropped = true; goto out; } /* send a copy of the saved skb */ skb1 = skb_clone(forw_packet->skb, GFP_ATOMIC); if (!skb1) goto out; batadv_send_broadcast_skb(skb1, forw_packet->if_outgoing); batadv_forw_packet_bcasts_dec(forw_packet); if (batadv_forw_packet_bcasts_left(forw_packet)) { batadv_forw_packet_bcast_queue(bat_priv, forw_packet, send_time); return; } out: /* do we get something for free()? */ if (batadv_forw_packet_steal(forw_packet, &bat_priv->forw_bcast_list_lock)) batadv_forw_packet_free(forw_packet, dropped); } /** * batadv_purge_outstanding_packets() - stop/purge scheduled bcast/OGMv1 packets * @bat_priv: the bat priv with all the soft interface information * @hard_iface: the hard interface to cancel and purge bcast/ogm packets on * * This method cancels and purges any broadcast and OGMv1 packet on the given * hard_iface. If hard_iface is NULL, broadcast and OGMv1 packets on all hard * interfaces will be canceled and purged. * * This function might sleep. */ void batadv_purge_outstanding_packets(struct batadv_priv *bat_priv, const struct batadv_hard_iface *hard_iface) { struct hlist_head head = HLIST_HEAD_INIT; if (hard_iface) batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "%s(): %s\n", __func__, hard_iface->net_dev->name); else batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "%s()\n", __func__); /* claim bcast list for free() */ spin_lock_bh(&bat_priv->forw_bcast_list_lock); batadv_forw_packet_list_steal(&bat_priv->forw_bcast_list, &head, hard_iface); spin_unlock_bh(&bat_priv->forw_bcast_list_lock); /* claim batman packet list for free() */ spin_lock_bh(&bat_priv->forw_bat_list_lock); batadv_forw_packet_list_steal(&bat_priv->forw_bat_list, &head, hard_iface); spin_unlock_bh(&bat_priv->forw_bat_list_lock); /* then cancel or wait for packet workers to finish and free */ batadv_forw_packet_list_free(&head); }
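The rebroadcast machinery above reduces to a simple loop: clone the queued skb, transmit the clone, decrement the per-skb num_bcasts counter, and requeue the delayed work with roughly a 5 ms gap until the counter reaches zero or the packet is stolen/purged. Below is a standalone simulation of just that counting and rescheduling logic; a printf stands in for batadv_send_broadcast_skb(), a sleep stands in for the delayed work queue, and the initial counter value of 3 is only an example (the real value comes from if_out->num_bcasts).

#include <stdio.h>
#include <time.h>

/* Stand-in for batadv_send_broadcast_skb(); the real code hands a clone
 * of the queued skb to the outgoing hard interface. */
static void send_broadcast_copy(int attempt)
{
        printf("rebroadcast #%d\n", attempt);
}

int main(void)
{
        unsigned char num_bcasts = 3;   /* example value only */
        struct timespec gap = { .tv_sec = 0, .tv_nsec = 5 * 1000 * 1000 };
        int attempt = 0;

        while (num_bcasts) {            /* batadv_forw_packet_bcasts_left() */
                send_broadcast_copy(++attempt);
                num_bcasts--;           /* batadv_forw_packet_bcasts_dec() */
                if (num_bcasts)
                        nanosleep(&gap, NULL);  /* requeue ~5 ms later */
        }
        return 0;
}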
// SPDX-License-Identifier: GPL-2.0 /* * Creating audit events from TTY input. * * Copyright (C) 2007 Red Hat, Inc. All rights reserved. * * Authors: Miloslav Trmac <mitr@redhat.com> */ #include <linux/audit.h> #include <linux/slab.h> #include <linux/tty.h> #include "tty.h" struct tty_audit_buf { struct mutex mutex; /* Protects all data below */ dev_t dev; /* The TTY which the data is from */ unsigned icanon:1; size_t valid; unsigned char *data; /* Allocated size N_TTY_BUF_SIZE */ }; static struct tty_audit_buf *tty_audit_buf_ref(void) { struct tty_audit_buf *buf; buf = current->signal->tty_audit_buf; WARN_ON(buf == ERR_PTR(-ESRCH)); return buf; } static struct tty_audit_buf *tty_audit_buf_alloc(void) { struct tty_audit_buf *buf; buf = kmalloc(sizeof(*buf), GFP_KERNEL); if (!buf) goto err; buf->data = kmalloc(N_TTY_BUF_SIZE, GFP_KERNEL); if (!buf->data) goto err_buf; mutex_init(&buf->mutex); buf->dev = MKDEV(0, 0); buf->icanon = 0; buf->valid = 0; return buf; err_buf: kfree(buf); err: return NULL; } static void tty_audit_buf_free(struct tty_audit_buf *buf) { WARN_ON(buf->valid != 0); kfree(buf->data); kfree(buf); } static void tty_audit_log(const char *description, dev_t dev, unsigned char *data, size_t size) { struct audit_buffer *ab; pid_t pid = task_pid_nr(current); uid_t uid = from_kuid(&init_user_ns, task_uid(current)); uid_t loginuid = from_kuid(&init_user_ns, audit_get_loginuid(current)); unsigned int sessionid = audit_get_sessionid(current); ab = audit_log_start(audit_context(), GFP_KERNEL, AUDIT_TTY); if (ab) { char name[sizeof(current->comm)]; audit_log_format(ab, "%s pid=%u uid=%u auid=%u ses=%u major=%d" " minor=%d comm=", description, pid, uid, loginuid, sessionid, MAJOR(dev), MINOR(dev)); get_task_comm(name, current); audit_log_untrustedstring(ab, name); audit_log_format(ab, " data="); audit_log_n_hex(ab, data, size); audit_log_end(ab); } } /* * tty_audit_buf_push - Push buffered data out * * Generate an audit message from the contents of @buf, which is owned by * the current task. @buf->mutex must be locked. */ static void tty_audit_buf_push(struct tty_audit_buf *buf) { if (buf->valid == 0) return; if (audit_enabled == AUDIT_OFF) { buf->valid = 0; return; } tty_audit_log("tty", buf->dev, buf->data, buf->valid); buf->valid = 0; } /** * tty_audit_exit - Handle a task exit * * Make sure all buffered data is written out and deallocate the buffer. * Only needs to be called if current->signal->tty_audit_buf != %NULL. * * The process is single-threaded at this point; no other threads share * current->signal.
*/ void tty_audit_exit(void) { struct tty_audit_buf *buf; buf = xchg(&current->signal->tty_audit_buf, ERR_PTR(-ESRCH)); if (!buf) return; tty_audit_buf_push(buf); tty_audit_buf_free(buf); } /* * tty_audit_fork - Copy TTY audit state for a new task * * Set up TTY audit state in @sig from current. @sig needs no locking. */ void tty_audit_fork(struct signal_struct *sig) { sig->audit_tty = current->signal->audit_tty; } /* * tty_audit_tiocsti - Log TIOCSTI */ void tty_audit_tiocsti(struct tty_struct *tty, char ch) { dev_t dev; dev = MKDEV(tty->driver->major, tty->driver->minor_start) + tty->index; if (tty_audit_push()) return; if (audit_enabled) tty_audit_log("ioctl=TIOCSTI", dev, &ch, 1); } /* * tty_audit_push - Flush current's pending audit data * * Returns 0 if success, -EPERM if tty audit is disabled */ int tty_audit_push(void) { struct tty_audit_buf *buf; if (~current->signal->audit_tty & AUDIT_TTY_ENABLE) return -EPERM; buf = tty_audit_buf_ref(); if (!IS_ERR_OR_NULL(buf)) { mutex_lock(&buf->mutex); tty_audit_buf_push(buf); mutex_unlock(&buf->mutex); } return 0; } /* * tty_audit_buf_get - Get an audit buffer. * * Get an audit buffer, allocate it if necessary. Return %NULL * if out of memory or ERR_PTR(-ESRCH) if tty_audit_exit() has already * occurred. Otherwise, return a new reference to the buffer. */ static struct tty_audit_buf *tty_audit_buf_get(void) { struct tty_audit_buf *buf; buf = tty_audit_buf_ref(); if (buf) return buf; buf = tty_audit_buf_alloc(); if (buf == NULL) { audit_log_lost("out of memory in TTY auditing"); return NULL; } /* Race to use this buffer, free it if another wins */ if (cmpxchg(&current->signal->tty_audit_buf, NULL, buf) != NULL) tty_audit_buf_free(buf); return tty_audit_buf_ref(); } /* * tty_audit_add_data - Add data for TTY auditing. * * Audit @data of @size from @tty, if necessary. */ void tty_audit_add_data(struct tty_struct *tty, const void *data, size_t size) { struct tty_audit_buf *buf; unsigned int icanon = !!L_ICANON(tty); unsigned int audit_tty; dev_t dev; audit_tty = READ_ONCE(current->signal->audit_tty); if (~audit_tty & AUDIT_TTY_ENABLE) return; if (unlikely(size == 0)) return; if (tty->driver->type == TTY_DRIVER_TYPE_PTY && tty->driver->subtype == PTY_TYPE_MASTER) return; if ((~audit_tty & AUDIT_TTY_LOG_PASSWD) && icanon && !L_ECHO(tty)) return; buf = tty_audit_buf_get(); if (IS_ERR_OR_NULL(buf)) return; mutex_lock(&buf->mutex); dev = MKDEV(tty->driver->major, tty->driver->minor_start) + tty->index; if (buf->dev != dev || buf->icanon != icanon) { tty_audit_buf_push(buf); buf->dev = dev; buf->icanon = icanon; } do { size_t run; run = N_TTY_BUF_SIZE - buf->valid; if (run > size) run = size; memcpy(buf->data + buf->valid, data, run); buf->valid += run; data += run; size -= run; if (buf->valid == N_TTY_BUF_SIZE) tty_audit_buf_push(buf); } while (size != 0); mutex_unlock(&buf->mutex); }
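tty_audit_add_data() ends with a straightforward chunking loop: input bytes are copied into a fixed-size staging buffer and an audit record is pushed whenever the buffer fills, so no single record carries more than one buffer's worth of data. The following standalone sketch reproduces only that loop; the buffer size is an assumption that mirrors the kernel's N_TTY_BUF_SIZE (4096 bytes in current kernels, defined elsewhere), and push() is a placeholder for tty_audit_buf_push().

#include <stdio.h>
#include <string.h>

#define STAGING_BUF_SIZE 4096   /* assumption: mirrors N_TTY_BUF_SIZE */

static unsigned char staging[STAGING_BUF_SIZE];
static size_t valid;

static void push(void)          /* placeholder for tty_audit_buf_push() */
{
        if (!valid)
                return;
        printf("flushing %zu byte(s) as one audit record\n", valid);
        valid = 0;
}

static void add_data(const unsigned char *data, size_t size)
{
        while (size) {
                size_t run = STAGING_BUF_SIZE - valid;

                if (run > size)
                        run = size;
                memcpy(staging + valid, data, run);
                valid += run;
                data += run;
                size -= run;
                if (valid == STAGING_BUF_SIZE)
                        push();         /* buffer full: emit a record now */
        }
}

int main(void)
{
        unsigned char sample[6000];

        memset(sample, 'x', sizeof(sample));
        add_data(sample, sizeof(sample));       /* emits one full 4096-byte record */
        push();                                 /* flush the 1904-byte remainder */
        return 0;
}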
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_UACCESS_H #define _ASM_X86_UACCESS_H /* * User space memory access functions */ #include <linux/compiler.h> #include <linux/kasan-checks.h> #include <linux/string.h> #include <asm/asm.h> #include <asm/page.h> #include <asm/smap.h> #include <asm/extable.h> /* * Test whether a block of memory is a valid user space address. * Returns 0 if the range is valid, nonzero otherwise. */ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, unsigned long limit) { /* * If we have used "sizeof()" for the size, * we know it won't overflow the limit (but * it might overflow the 'addr', so it's * important to subtract the size from the * limit, not add it to the address). */ if (__builtin_constant_p(size)) return unlikely(addr > limit - size); /* Arbitrary sizes?
Be careful about overflow */ addr += size; if (unlikely(addr < size)) return true; return unlikely(addr > limit); } #define __range_not_ok(addr, size, limit) \ ({ \ __chk_user_ptr(addr); \ __chk_range_not_ok((unsigned long __force)(addr), size, limit); \ }) #ifdef CONFIG_DEBUG_ATOMIC_SLEEP static inline bool pagefault_disabled(void); # define WARN_ON_IN_IRQ() \ WARN_ON_ONCE(!in_task() && !pagefault_disabled()) #else # define WARN_ON_IN_IRQ() #endif /** * access_ok - Checks if a user space pointer is valid * @addr: User space pointer to start of block to check * @size: Size of block to check * * Context: User context only. This function may sleep if pagefaults are * enabled. * * Checks if a pointer to a block of memory in user space is valid. * * Note that, depending on architecture, this function probably just * checks that the pointer is in the user space range - after calling * this function, memory access functions may still return -EFAULT. * * Return: true (nonzero) if the memory block may be valid, false (zero) * if it is definitely invalid. */ #define access_ok(addr, size) \ ({ \ WARN_ON_IN_IRQ(); \ likely(!__range_not_ok(addr, size, TASK_SIZE_MAX)); \ }) extern int __get_user_1(void); extern int __get_user_2(void); extern int __get_user_4(void); extern int __get_user_8(void); extern int __get_user_nocheck_1(void); extern int __get_user_nocheck_2(void); extern int __get_user_nocheck_4(void); extern int __get_user_nocheck_8(void); extern int __get_user_bad(void); #define __uaccess_begin() stac() #define __uaccess_end() clac() #define __uaccess_begin_nospec() \ ({ \ stac(); \ barrier_nospec(); \ }) /* * This is the smallest unsigned integer type that can fit a value * (up to 'long long') */ #define __inttype(x) __typeof__( \ __typefits(x,char, \ __typefits(x,short, \ __typefits(x,int, \ __typefits(x,long,0ULL))))) #define __typefits(x,type,not) \ __builtin_choose_expr(sizeof(x)<=sizeof(type),(unsigned type)0,not) /* * This is used for both get_user() and __get_user() to expand to * the proper special function call that has odd calling conventions * due to returning both a value and an error, and that depends on * the size of the pointer passed in. * * Careful: we have to cast the result to the type of the pointer * for sign reasons. * * The use of _ASM_DX as the register specifier is a bit of a * simplification, as gcc only cares about it as the starting point * and not size: for a 64-bit value it will use %ecx:%edx on 32 bits * (%ecx being the next register in gcc's x86 register sequence), and * %rdx on 64 bits. * * Clang/LLVM cares about the size of the register, but still wants * the base register for something that ends up being a pair. */ #define do_get_user_call(fn,x,ptr) \ ({ \ int __ret_gu; \ register __inttype(*(ptr)) __val_gu asm("%"_ASM_DX); \ __chk_user_ptr(ptr); \ asm volatile("call __" #fn "_%P4" \ : "=a" (__ret_gu), "=r" (__val_gu), \ ASM_CALL_CONSTRAINT \ : "0" (ptr), "i" (sizeof(*(ptr)))); \ (x) = (__force __typeof__(*(ptr))) __val_gu; \ __builtin_expect(__ret_gu, 0); \ }) /** * get_user - Get a simple variable from user space. * @x: Variable to store result. * @ptr: Source address, in user space. * * Context: User context only. This function may sleep if pagefaults are * enabled. * * This macro copies a single simple variable from user space to kernel * space. It supports simple types like char and int, but not larger * data types like structures or arrays. 
* * @ptr must have pointer-to-simple-variable type, and the result of * dereferencing @ptr must be assignable to @x without a cast. * * Return: zero on success, or -EFAULT on error. * On error, the variable @x is set to zero. */ #define get_user(x,ptr) ({ might_fault(); do_get_user_call(get_user,x,ptr); }) /** * __get_user - Get a simple variable from user space, with less checking. * @x: Variable to store result. * @ptr: Source address, in user space. * * Context: User context only. This function may sleep if pagefaults are * enabled. * * This macro copies a single simple variable from user space to kernel * space. It supports simple types like char and int, but not larger * data types like structures or arrays. * * @ptr must have pointer-to-simple-variable type, and the result of * dereferencing @ptr must be assignable to @x without a cast. * * Caller must check the pointer with access_ok() before calling this * function. * * Return: zero on success, or -EFAULT on error. * On error, the variable @x is set to zero. */ #define __get_user(x,ptr) do_get_user_call(get_user_nocheck,x,ptr) #ifdef CONFIG_X86_32 #define __put_user_goto_u64(x, addr, label) \ asm_volatile_goto("\n" \ "1: movl %%eax,0(%1)\n" \ "2: movl %%edx,4(%1)\n" \ _ASM_EXTABLE_UA(1b, %l2) \ _ASM_EXTABLE_UA(2b, %l2) \ : : "A" (x), "r" (addr) \ : : label) #else #define __put_user_goto_u64(x, ptr, label) \ __put_user_goto(x, ptr, "q", "er", label) #endif extern void __put_user_bad(void); /* * Strange magic calling convention: pointer in %ecx, * value in %eax(:%edx), return value in %ecx. clobbers %rbx */ extern void __put_user_1(void); extern void __put_user_2(void); extern void __put_user_4(void); extern void __put_user_8(void); extern void __put_user_nocheck_1(void); extern void __put_user_nocheck_2(void); extern void __put_user_nocheck_4(void); extern void __put_user_nocheck_8(void); /* * ptr must be evaluated and assigned to the temporary __ptr_pu before * the assignment of x to __val_pu, to avoid any function calls * involved in the ptr expression (possibly implicitly generated due * to KASAN) from clobbering %ax. */ #define do_put_user_call(fn,x,ptr) \ ({ \ int __ret_pu; \ void __user *__ptr_pu; \ register __typeof__(*(ptr)) __val_pu asm("%"_ASM_AX); \ __chk_user_ptr(ptr); \ __ptr_pu = (ptr); \ __val_pu = (x); \ asm volatile("call __" #fn "_%P[size]" \ : "=c" (__ret_pu), \ ASM_CALL_CONSTRAINT \ : "0" (__ptr_pu), \ "r" (__val_pu), \ [size] "i" (sizeof(*(ptr))) \ :"ebx"); \ __builtin_expect(__ret_pu, 0); \ }) /** * put_user - Write a simple value into user space. * @x: Value to copy to user space. * @ptr: Destination address, in user space. * * Context: User context only. This function may sleep if pagefaults are * enabled. * * This macro copies a single simple value from kernel space to user * space. It supports simple types like char and int, but not larger * data types like structures or arrays. * * @ptr must have pointer-to-simple-variable type, and @x must be assignable * to the result of dereferencing @ptr. * * Return: zero on success, or -EFAULT on error. */ #define put_user(x, ptr) ({ might_fault(); do_put_user_call(put_user,x,ptr); }) /** * __put_user - Write a simple value into user space, with less checking. * @x: Value to copy to user space. * @ptr: Destination address, in user space. * * Context: User context only. This function may sleep if pagefaults are * enabled. * * This macro copies a single simple value from kernel space to user * space. 
It supports simple types like char and int, but not larger * data types like structures or arrays. * * @ptr must have pointer-to-simple-variable type, and @x must be assignable * to the result of dereferencing @ptr. * * Caller must check the pointer with access_ok() before calling this * function. * * Return: zero on success, or -EFAULT on error. */ #define __put_user(x, ptr) do_put_user_call(put_user_nocheck,x,ptr) #define __put_user_size(x, ptr, size, label) \ do { \ __chk_user_ptr(ptr); \ switch (size) { \ case 1: \ __put_user_goto(x, ptr, "b", "iq", label); \ break; \ case 2: \ __put_user_goto(x, ptr, "w", "ir", label); \ break; \ case 4: \ __put_user_goto(x, ptr, "l", "ir", label); \ break; \ case 8: \ __put_user_goto_u64(x, ptr, label); \ break; \ default: \ __put_user_bad(); \ } \ } while (0) #ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT #ifdef CONFIG_X86_32 #define __get_user_asm_u64(x, ptr, label) do { \ unsigned int __gu_low, __gu_high; \ const unsigned int __user *__gu_ptr; \ __gu_ptr = (const void __user *)(ptr); \ __get_user_asm(__gu_low, __gu_ptr, "l", "=r", label); \ __get_user_asm(__gu_high, __gu_ptr+1, "l", "=r", label); \ (x) = ((unsigned long long)__gu_high << 32) | __gu_low; \ } while (0) #else #define __get_user_asm_u64(x, ptr, label) \ __get_user_asm(x, ptr, "q", "=r", label) #endif #define __get_user_size(x, ptr, size, label) \ do { \ __chk_user_ptr(ptr); \ switch (size) { \ case 1: { \ unsigned char x_u8__; \ __get_user_asm(x_u8__, ptr, "b", "=q", label); \ (x) = x_u8__; \ break; \ } \ case 2: \ __get_user_asm(x, ptr, "w", "=r", label); \ break; \ case 4: \ __get_user_asm(x, ptr, "l", "=r", label); \ break; \ case 8: \ __get_user_asm_u64(x, ptr, label); \ break; \ default: \ (x) = __get_user_bad(); \ } \ } while (0) #define __get_user_asm(x, addr, itype, ltype, label) \ asm_volatile_goto("\n" \ "1: mov"itype" %[umem],%[output]\n" \ _ASM_EXTABLE_UA(1b, %l2) \ : [output] ltype(x) \ : [umem] "m" (__m(addr)) \ : : label) #else // !CONFIG_CC_HAS_ASM_GOTO_OUTPUT #ifdef CONFIG_X86_32 #define __get_user_asm_u64(x, ptr, retval) \ ({ \ __typeof__(ptr) __ptr = (ptr); \ asm volatile("\n" \ "1: movl %[lowbits],%%eax\n" \ "2: movl %[highbits],%%edx\n" \ "3:\n" \ ".section .fixup,\"ax\"\n" \ "4: mov %[efault],%[errout]\n" \ " xorl %%eax,%%eax\n" \ " xorl %%edx,%%edx\n" \ " jmp 3b\n" \ ".previous\n" \ _ASM_EXTABLE_UA(1b, 4b) \ _ASM_EXTABLE_UA(2b, 4b) \ : [errout] "=r" (retval), \ [output] "=&A"(x) \ : [lowbits] "m" (__m(__ptr)), \ [highbits] "m" __m(((u32 __user *)(__ptr)) + 1), \ [efault] "i" (-EFAULT), "0" (retval)); \ }) #else #define __get_user_asm_u64(x, ptr, retval) \ __get_user_asm(x, ptr, retval, "q", "=r") #endif #define __get_user_size(x, ptr, size, retval) \ do { \ unsigned char x_u8__; \ \ retval = 0; \ __chk_user_ptr(ptr); \ switch (size) { \ case 1: \ __get_user_asm(x_u8__, ptr, retval, "b", "=q"); \ (x) = x_u8__; \ break; \ case 2: \ __get_user_asm(x, ptr, retval, "w", "=r"); \ break; \ case 4: \ __get_user_asm(x, ptr, retval, "l", "=r"); \ break; \ case 8: \ __get_user_asm_u64(x, ptr, retval); \ break; \ default: \ (x) = __get_user_bad(); \ } \ } while (0) #define __get_user_asm(x, addr, err, itype, ltype) \ asm volatile("\n" \ "1: mov"itype" %[umem],%[output]\n" \ "2:\n" \ ".section .fixup,\"ax\"\n" \ "3: mov %[efault],%[errout]\n" \ " xorl %k[output],%k[output]\n" \ " jmp 2b\n" \ ".previous\n" \ _ASM_EXTABLE_UA(1b, 3b) \ : [errout] "=r" (err), \ [output] ltype(x) \ : [umem] "m" (__m(addr)), \ [efault] "i" (-EFAULT), "0" (err)) #endif // CONFIG_CC_ASM_GOTO_OUTPUT #ifdef 
CONFIG_CC_HAS_ASM_GOTO_TIED_OUTPUT #define __try_cmpxchg_user_asm(itype, ltype, _ptr, _pold, _new, label) ({ \ bool success; \ __typeof__(_ptr) _old = (__typeof__(_ptr))(_pold); \ __typeof__(*(_ptr)) __old = *_old; \ __typeof__(*(_ptr)) __new = (_new); \ asm_volatile_goto("\n" \ "1: " LOCK_PREFIX "cmpxchg"itype" %[new], %[ptr]\n"\ _ASM_EXTABLE_UA(1b, %l[label]) \ : CC_OUT(z) (success), \ [ptr] "+m" (*_ptr), \ [old] "+a" (__old) \ : [new] ltype (__new) \ : "memory" \ : label); \ if (unlikely(!success)) \ *_old = __old; \ likely(success); }) #ifdef CONFIG_X86_32 #define __try_cmpxchg64_user_asm(_ptr, _pold, _new, label) ({ \ bool success; \ __typeof__(_ptr) _old = (__typeof__(_ptr))(_pold); \ __typeof__(*(_ptr)) __old = *_old; \ __typeof__(*(_ptr)) __new = (_new); \ asm_volatile_goto("\n" \ "1: " LOCK_PREFIX "cmpxchg8b %[ptr]\n" \ _ASM_EXTABLE_UA(1b, %l[label]) \ : CC_OUT(z) (success), \ "+A" (__old), \ [ptr] "+m" (*_ptr) \ : "b" ((u32)__new), \ "c" ((u32)((u64)__new >> 32)) \ : "memory" \ : label); \ if (unlikely(!success)) \ *_old = __old; \ likely(success); }) #endif // CONFIG_X86_32 #else // !CONFIG_CC_HAS_ASM_GOTO_TIED_OUTPUT #define __try_cmpxchg_user_asm(itype, ltype, _ptr, _pold, _new, label) ({ \ int __err = 0; \ bool success; \ __typeof__(_ptr) _old = (__typeof__(_ptr))(_pold); \ __typeof__(*(_ptr)) __old = *_old; \ __typeof__(*(_ptr)) __new = (_new); \ asm volatile("\n" \ "1: " LOCK_PREFIX "cmpxchg"itype" %[new], %[ptr]\n"\ CC_SET(z) \ "2:\n" \ _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG, \ %[errout]) \ : CC_OUT(z) (success), \ [errout] "+r" (__err), \ [ptr] "+m" (*_ptr), \ [old] "+a" (__old) \ : [new] ltype (__new) \ : "memory"); \ if (unlikely(__err)) \ goto label; \ if (unlikely(!success)) \ *_old = __old; \ likely(success); }) #ifdef CONFIG_X86_32 /* * Unlike the normal CMPXCHG, hardcode ECX for both success/fail and error. * There are only six GPRs available and four (EAX, EBX, ECX, and EDX) are * hardcoded by CMPXCHG8B, leaving only ESI and EDI. If the compiler uses * both ESI and EDI for the memory operand, compilation will fail if the error * is an input+output as there will be no register available for input. */ #define __try_cmpxchg64_user_asm(_ptr, _pold, _new, label) ({ \ int __result; \ __typeof__(_ptr) _old = (__typeof__(_ptr))(_pold); \ __typeof__(*(_ptr)) __old = *_old; \ __typeof__(*(_ptr)) __new = (_new); \ asm volatile("\n" \ "1: " LOCK_PREFIX "cmpxchg8b %[ptr]\n" \ "mov $0, %%ecx\n\t" \ "setz %%cl\n" \ "2:\n" \ _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG, %%ecx) \ : [result]"=c" (__result), \ "+A" (__old), \ [ptr] "+m" (*_ptr) \ : "b" ((u32)__new), \ "c" ((u32)((u64)__new >> 32)) \ : "memory", "cc"); \ if (unlikely(__result < 0)) \ goto label; \ if (unlikely(!__result)) \ *_old = __old; \ likely(__result); }) #endif // CONFIG_X86_32 #endif // CONFIG_CC_HAS_ASM_GOTO_TIED_OUTPUT /* FIXME: this hack is definitely wrong -AK */ struct __large_struct { unsigned long buf[100]; }; #define __m(x) (*(struct __large_struct __user *)(x)) /* * Tell gcc we read from memory instead of writing: this is because * we do not write to any memory gcc knows about, so there are no * aliasing issues. 
*/ #define __put_user_goto(x, addr, itype, ltype, label) \ asm_volatile_goto("\n" \ "1: mov"itype" %0,%1\n" \ _ASM_EXTABLE_UA(1b, %l2) \ : : ltype(x), "m" (__m(addr)) \ : : label) extern unsigned long copy_from_user_nmi(void *to, const void __user *from, unsigned long n); extern __must_check long strncpy_from_user(char *dst, const char __user *src, long count); extern __must_check long strnlen_user(const char __user *str, long n); unsigned long __must_check clear_user(void __user *mem, unsigned long len); unsigned long __must_check __clear_user(void __user *mem, unsigned long len); #ifdef CONFIG_ARCH_HAS_COPY_MC unsigned long __must_check copy_mc_to_kernel(void *to, const void *from, unsigned len); #define copy_mc_to_kernel copy_mc_to_kernel unsigned long __must_check copy_mc_to_user(void __user *to, const void *from, unsigned len); #endif /* * movsl can be slow when source and dest are not both 8-byte aligned */ #ifdef CONFIG_X86_INTEL_USERCOPY extern struct movsl_mask { int mask; } ____cacheline_aligned_in_smp movsl_mask; #endif #define ARCH_HAS_NOCACHE_UACCESS 1 #ifdef CONFIG_X86_32 # include <asm/uaccess_32.h> #else # include <asm/uaccess_64.h> #endif /* * The "unsafe" user accesses aren't really "unsafe", but the naming * is a big fat warning: you have to not only do the access_ok() * checking before using them, but you have to surround them with the * user_access_begin/end() pair. */ static __must_check __always_inline bool user_access_begin(const void __user *ptr, size_t len) { if (unlikely(!access_ok(ptr,len))) return 0; __uaccess_begin_nospec(); return 1; } #define user_access_begin(a,b) user_access_begin(a,b) #define user_access_end() __uaccess_end() #define user_access_save() smap_save() #define user_access_restore(x) smap_restore(x) #define unsafe_put_user(x, ptr, label) \ __put_user_size((__typeof__(*(ptr)))(x), (ptr), sizeof(*(ptr)), label) #ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT #define unsafe_get_user(x, ptr, err_label) \ do { \ __inttype(*(ptr)) __gu_val; \ __get_user_size(__gu_val, (ptr), sizeof(*(ptr)), err_label); \ (x) = (__force __typeof__(*(ptr)))__gu_val; \ } while (0) #else // !CONFIG_CC_HAS_ASM_GOTO_OUTPUT #define unsafe_get_user(x, ptr, err_label) \ do { \ int __gu_err; \ __inttype(*(ptr)) __gu_val; \ __get_user_size(__gu_val, (ptr), sizeof(*(ptr)), __gu_err); \ (x) = (__force __typeof__(*(ptr)))__gu_val; \ if (unlikely(__gu_err)) goto err_label; \ } while (0) #endif // CONFIG_CC_HAS_ASM_GOTO_OUTPUT extern void __try_cmpxchg_user_wrong_size(void); #ifndef CONFIG_X86_32 #define __try_cmpxchg64_user_asm(_ptr, _oldp, _nval, _label) \ __try_cmpxchg_user_asm("q", "r", (_ptr), (_oldp), (_nval), _label) #endif /* * Force the pointer to u<size> to match the size expected by the asm helper. * clang/LLVM compiles all cases and only discards the unused paths after * processing errors, which breaks i386 if the pointer is an 8-byte value. 
*/ #define unsafe_try_cmpxchg_user(_ptr, _oldp, _nval, _label) ({ \ bool __ret; \ __chk_user_ptr(_ptr); \ switch (sizeof(*(_ptr))) { \ case 1: __ret = __try_cmpxchg_user_asm("b", "q", \ (__force u8 *)(_ptr), (_oldp), \ (_nval), _label); \ break; \ case 2: __ret = __try_cmpxchg_user_asm("w", "r", \ (__force u16 *)(_ptr), (_oldp), \ (_nval), _label); \ break; \ case 4: __ret = __try_cmpxchg_user_asm("l", "r", \ (__force u32 *)(_ptr), (_oldp), \ (_nval), _label); \ break; \ case 8: __ret = __try_cmpxchg64_user_asm((__force u64 *)(_ptr), (_oldp),\ (_nval), _label); \ break; \ default: __try_cmpxchg_user_wrong_size(); \ } \ __ret; }) /* "Returns" 0 on success, 1 on failure, -EFAULT if the access faults. */ #define __try_cmpxchg_user(_ptr, _oldp, _nval, _label) ({ \ int __ret = -EFAULT; \ __uaccess_begin_nospec(); \ __ret = !unsafe_try_cmpxchg_user(_ptr, _oldp, _nval, _label); \ _label: \ __uaccess_end(); \ __ret; \ }) /* * We want the unsafe accessors to always be inlined and use * the error labels - thus the macro games. */ #define unsafe_copy_loop(dst, src, len, type, label) \ while (len >= sizeof(type)) { \ unsafe_put_user(*(type *)(src),(type __user *)(dst),label); \ dst += sizeof(type); \ src += sizeof(type); \ len -= sizeof(type); \ } #define unsafe_copy_to_user(_dst,_src,_len,label) \ do { \ char __user *__ucu_dst = (_dst); \ const char *__ucu_src = (_src); \ size_t __ucu_len = (_len); \ unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u64, label); \ unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u32, label); \ unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u16, label); \ unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u8, label); \ } while (0) #define HAVE_GET_KERNEL_NOFAULT #ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT #define __get_kernel_nofault(dst, src, type, err_label) \ __get_user_size(*((type *)(dst)), (__force type __user *)(src), \ sizeof(type), err_label) #else // !CONFIG_CC_HAS_ASM_GOTO_OUTPUT #define __get_kernel_nofault(dst, src, type, err_label) \ do { \ int __kr_err; \ \ __get_user_size(*((type *)(dst)), (__force type __user *)(src), \ sizeof(type), __kr_err); \ if (unlikely(__kr_err)) \ goto err_label; \ } while (0) #endif // CONFIG_CC_HAS_ASM_GOTO_OUTPUT #define __put_kernel_nofault(dst, src, type, err_label) \ __put_user_size(*((type *)(src)), (__force type __user *)(dst), \ sizeof(type), err_label) #endif /* _ASM_X86_UACCESS_H */
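Taken together, the accessors above split into checked forms (get_user()/put_user(), which perform their own access_ok() and may fault) and "unsafe" forms that must sit inside a user_access_begin()/user_access_end() window and jump to a caller-supplied label on fault. The sketch below is a hedged illustration only — example_bump_word() is an invented name, not part of this header — showing the two patterns side by side.

/* Hypothetical illustration (not part of this header): read a word with the
 * checking accessor, then write the reply with the "unsafe" accessors, which
 * require an explicit user_access_begin()/user_access_end() pair and jump to
 * the supplied label if the access faults.
 */
static int example_bump_word(u32 __user *uptr)
{
	u32 val;

	if (get_user(val, uptr))		/* checked: does access_ok() itself */
		return -EFAULT;

	if (!user_access_begin(uptr, sizeof(*uptr)))
		return -EFAULT;
	unsafe_put_user(val + 1, uptr, Efault);	/* no per-access checking */
	user_access_end();
	return 0;

Efault:
	user_access_end();
	return -EFAULT;
}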
// SPDX-License-Identifier: GPL-2.0-or-later /* RxRPC packet transmission * * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/net.h> #include <linux/gfp.h> #include <linux/skbuff.h> #include <linux/export.h> #include <net/sock.h> #include <net/af_rxrpc.h> #include "ar-internal.h" struct rxrpc_ack_buffer { struct rxrpc_wire_header whdr; struct rxrpc_ackpacket ack; u8 acks[255]; u8 pad[3]; struct rxrpc_ackinfo ackinfo; }; struct rxrpc_abort_buffer { struct rxrpc_wire_header whdr; __be32 abort_code; }; static const char rxrpc_keepalive_string[] = ""; /* * Increase Tx backoff on transmission failure and clear it on success. 
*/ static void rxrpc_tx_backoff(struct rxrpc_call *call, int ret) { if (ret < 0) { u16 tx_backoff = READ_ONCE(call->tx_backoff); if (tx_backoff < HZ) WRITE_ONCE(call->tx_backoff, tx_backoff + 1); } else { WRITE_ONCE(call->tx_backoff, 0); } } /* * Arrange for a keepalive ping a certain time after we last transmitted. This * lets the far side know we're still interested in this call and helps keep * the route through any intervening firewall open. * * Receiving a response to the ping will prevent the ->expect_rx_by timer from * expiring. */ static void rxrpc_set_keepalive(struct rxrpc_call *call) { unsigned long now = jiffies, keepalive_at = call->next_rx_timo / 6; keepalive_at += now; WRITE_ONCE(call->keepalive_at, keepalive_at); rxrpc_reduce_call_timer(call, keepalive_at, now, rxrpc_timer_set_for_keepalive); } /* * Fill out an ACK packet. */ static size_t rxrpc_fill_out_ack(struct rxrpc_connection *conn, struct rxrpc_call *call, struct rxrpc_ack_buffer *pkt, rxrpc_seq_t *_hard_ack, rxrpc_seq_t *_top, u8 reason) { rxrpc_serial_t serial; unsigned int tmp; rxrpc_seq_t hard_ack, top, seq; int ix; u32 mtu, jmax; u8 *ackp = pkt->acks; tmp = atomic_xchg(&call->ackr_nr_unacked, 0); tmp |= atomic_xchg(&call->ackr_nr_consumed, 0); if (!tmp && (reason == RXRPC_ACK_DELAY || reason == RXRPC_ACK_IDLE)) return 0; /* Barrier against rxrpc_input_data(). */ serial = call->ackr_serial; hard_ack = READ_ONCE(call->rx_hard_ack); top = smp_load_acquire(&call->rx_top); *_hard_ack = hard_ack; *_top = top; pkt->ack.bufferSpace = htons(0); pkt->ack.maxSkew = htons(0); pkt->ack.firstPacket = htonl(hard_ack + 1); pkt->ack.previousPacket = htonl(call->ackr_highest_seq); pkt->ack.serial = htonl(serial); pkt->ack.reason = reason; pkt->ack.nAcks = top - hard_ack; if (reason == RXRPC_ACK_PING) pkt->whdr.flags |= RXRPC_REQUEST_ACK; if (after(top, hard_ack)) { seq = hard_ack + 1; do { ix = seq & RXRPC_RXTX_BUFF_MASK; if (call->rxtx_buffer[ix]) *ackp++ = RXRPC_ACK_TYPE_ACK; else *ackp++ = RXRPC_ACK_TYPE_NACK; seq++; } while (before_eq(seq, top)); } mtu = conn->params.peer->if_mtu; mtu -= conn->params.peer->hdrsize; jmax = (call->nr_jumbo_bad > 3) ? 1 : rxrpc_rx_jumbo_max; pkt->ackinfo.rxMTU = htonl(rxrpc_rx_mtu); pkt->ackinfo.maxMTU = htonl(mtu); pkt->ackinfo.rwind = htonl(call->rx_winsize); pkt->ackinfo.jumbo_max = htonl(jmax); *ackp++ = 0; *ackp++ = 0; *ackp++ = 0; return top - hard_ack + 3; } /* * Record the beginning of an RTT probe. */ static int rxrpc_begin_rtt_probe(struct rxrpc_call *call, rxrpc_serial_t serial, enum rxrpc_rtt_tx_trace why) { unsigned long avail = call->rtt_avail; int rtt_slot = 9; if (!(avail & RXRPC_CALL_RTT_AVAIL_MASK)) goto no_slot; rtt_slot = __ffs(avail & RXRPC_CALL_RTT_AVAIL_MASK); if (!test_and_clear_bit(rtt_slot, &call->rtt_avail)) goto no_slot; call->rtt_serial[rtt_slot] = serial; call->rtt_sent_at[rtt_slot] = ktime_get_real(); smp_wmb(); /* Write data before avail bit */ set_bit(rtt_slot + RXRPC_CALL_RTT_PEND_SHIFT, &call->rtt_avail); trace_rxrpc_rtt_tx(call, why, rtt_slot, serial); return rtt_slot; no_slot: trace_rxrpc_rtt_tx(call, rxrpc_rtt_tx_no_slot, rtt_slot, serial); return -1; } /* * Cancel an RTT probe. */ static void rxrpc_cancel_rtt_probe(struct rxrpc_call *call, rxrpc_serial_t serial, int rtt_slot) { if (rtt_slot != -1) { clear_bit(rtt_slot + RXRPC_CALL_RTT_PEND_SHIFT, &call->rtt_avail); smp_wmb(); /* Clear pending bit before setting slot */ set_bit(rtt_slot, &call->rtt_avail); trace_rxrpc_rtt_tx(call, rxrpc_rtt_tx_cancel, rtt_slot, serial); } } /* * Send an ACK call packet. 
*/ int rxrpc_send_ack_packet(struct rxrpc_call *call, bool ping, rxrpc_serial_t *_serial) { struct rxrpc_connection *conn; struct rxrpc_ack_buffer *pkt; struct msghdr msg; struct kvec iov[2]; rxrpc_serial_t serial; rxrpc_seq_t hard_ack, top; size_t len, n; int ret, rtt_slot = -1; u8 reason; if (test_bit(RXRPC_CALL_DISCONNECTED, &call->flags)) return -ECONNRESET; pkt = kzalloc(sizeof(*pkt), GFP_KERNEL); if (!pkt) return -ENOMEM; conn = call->conn; msg.msg_name = &call->peer->srx.transport; msg.msg_namelen = call->peer->srx.transport_len; msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_flags = 0; pkt->whdr.epoch = htonl(conn->proto.epoch); pkt->whdr.cid = htonl(call->cid); pkt->whdr.callNumber = htonl(call->call_id); pkt->whdr.seq = 0; pkt->whdr.type = RXRPC_PACKET_TYPE_ACK; pkt->whdr.flags = RXRPC_SLOW_START_OK | conn->out_clientflag; pkt->whdr.userStatus = 0; pkt->whdr.securityIndex = call->security_ix; pkt->whdr._rsvd = 0; pkt->whdr.serviceId = htons(call->service_id); spin_lock_bh(&call->lock); if (ping) { reason = RXRPC_ACK_PING; } else { reason = call->ackr_reason; if (!call->ackr_reason) { spin_unlock_bh(&call->lock); ret = 0; goto out; } call->ackr_reason = 0; } n = rxrpc_fill_out_ack(conn, call, pkt, &hard_ack, &top, reason); spin_unlock_bh(&call->lock); if (n == 0) { kfree(pkt); return 0; } iov[0].iov_base = pkt; iov[0].iov_len = sizeof(pkt->whdr) + sizeof(pkt->ack) + n; iov[1].iov_base = &pkt->ackinfo; iov[1].iov_len = sizeof(pkt->ackinfo); len = iov[0].iov_len + iov[1].iov_len; serial = atomic_inc_return(&conn->serial); pkt->whdr.serial = htonl(serial); trace_rxrpc_tx_ack(call->debug_id, serial, ntohl(pkt->ack.firstPacket), ntohl(pkt->ack.serial), pkt->ack.reason, pkt->ack.nAcks); if (_serial) *_serial = serial; if (ping) rtt_slot = rxrpc_begin_rtt_probe(call, serial, rxrpc_rtt_tx_ping); ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 2, len); conn->params.peer->last_tx_at = ktime_get_seconds(); if (ret < 0) trace_rxrpc_tx_fail(call->debug_id, serial, ret, rxrpc_tx_point_call_ack); else trace_rxrpc_tx_packet(call->debug_id, &pkt->whdr, rxrpc_tx_point_call_ack); rxrpc_tx_backoff(call, ret); if (call->state < RXRPC_CALL_COMPLETE) { if (ret < 0) { rxrpc_cancel_rtt_probe(call, serial, rtt_slot); rxrpc_propose_ACK(call, pkt->ack.reason, ntohl(pkt->ack.serial), false, true, rxrpc_propose_ack_retry_tx); } rxrpc_set_keepalive(call); } out: kfree(pkt); return ret; } /* * Send an ABORT call packet. */ int rxrpc_send_abort_packet(struct rxrpc_call *call) { struct rxrpc_connection *conn; struct rxrpc_abort_buffer pkt; struct msghdr msg; struct kvec iov[1]; rxrpc_serial_t serial; int ret; /* Don't bother sending aborts for a client call once the server has * hard-ACK'd all of its request data. After that point, we're not * going to stop the operation proceeding, and whilst we might limit * the reply, it's not worth it if we can send a new call on the same * channel instead, thereby closing off this call. 
*/ if (rxrpc_is_client_call(call) && test_bit(RXRPC_CALL_TX_LAST, &call->flags)) return 0; if (test_bit(RXRPC_CALL_DISCONNECTED, &call->flags)) return -ECONNRESET; conn = call->conn; msg.msg_name = &call->peer->srx.transport; msg.msg_namelen = call->peer->srx.transport_len; msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_flags = 0; pkt.whdr.epoch = htonl(conn->proto.epoch); pkt.whdr.cid = htonl(call->cid); pkt.whdr.callNumber = htonl(call->call_id); pkt.whdr.seq = 0; pkt.whdr.type = RXRPC_PACKET_TYPE_ABORT; pkt.whdr.flags = conn->out_clientflag; pkt.whdr.userStatus = 0; pkt.whdr.securityIndex = call->security_ix; pkt.whdr._rsvd = 0; pkt.whdr.serviceId = htons(call->service_id); pkt.abort_code = htonl(call->abort_code); iov[0].iov_base = &pkt; iov[0].iov_len = sizeof(pkt); serial = atomic_inc_return(&conn->serial); pkt.whdr.serial = htonl(serial); ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 1, sizeof(pkt)); conn->params.peer->last_tx_at = ktime_get_seconds(); if (ret < 0) trace_rxrpc_tx_fail(call->debug_id, serial, ret, rxrpc_tx_point_call_abort); else trace_rxrpc_tx_packet(call->debug_id, &pkt.whdr, rxrpc_tx_point_call_abort); rxrpc_tx_backoff(call, ret); return ret; } /* * send a packet through the transport endpoint */ int rxrpc_send_data_packet(struct rxrpc_call *call, struct sk_buff *skb, bool retrans) { struct rxrpc_connection *conn = call->conn; struct rxrpc_wire_header whdr; struct rxrpc_skb_priv *sp = rxrpc_skb(skb); struct msghdr msg; struct kvec iov[2]; rxrpc_serial_t serial; size_t len; int ret, rtt_slot = -1; _enter(",{%d}", skb->len); if (hlist_unhashed(&call->error_link)) { spin_lock_bh(&call->peer->lock); hlist_add_head_rcu(&call->error_link, &call->peer->error_targets); spin_unlock_bh(&call->peer->lock); } /* Each transmission of a Tx packet needs a new serial number */ serial = atomic_inc_return(&conn->serial); whdr.epoch = htonl(conn->proto.epoch); whdr.cid = htonl(call->cid); whdr.callNumber = htonl(call->call_id); whdr.seq = htonl(sp->hdr.seq); whdr.serial = htonl(serial); whdr.type = RXRPC_PACKET_TYPE_DATA; whdr.flags = sp->hdr.flags; whdr.userStatus = 0; whdr.securityIndex = call->security_ix; whdr._rsvd = htons(sp->hdr._rsvd); whdr.serviceId = htons(call->service_id); if (test_bit(RXRPC_CONN_PROBING_FOR_UPGRADE, &conn->flags) && sp->hdr.seq == 1) whdr.userStatus = RXRPC_USERSTATUS_SERVICE_UPGRADE; iov[0].iov_base = &whdr; iov[0].iov_len = sizeof(whdr); iov[1].iov_base = skb->head; iov[1].iov_len = skb->len; len = iov[0].iov_len + iov[1].iov_len; msg.msg_name = &call->peer->srx.transport; msg.msg_namelen = call->peer->srx.transport_len; msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_flags = 0; /* If our RTT cache needs working on, request an ACK. Also request * ACKs if a DATA packet appears to have been lost. * * However, we mustn't request an ACK on the last reply packet of a * service call, lest OpenAFS incorrectly send us an ACK with some * soft-ACKs in it and then never follow up with a proper hard ACK. 
*/ if ((!(sp->hdr.flags & RXRPC_LAST_PACKET) || rxrpc_to_server(sp) ) && (test_and_clear_bit(RXRPC_CALL_EV_ACK_LOST, &call->events) || retrans || call->cong_mode == RXRPC_CALL_SLOW_START || (call->peer->rtt_count < 3 && sp->hdr.seq & 1) || ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000), ktime_get_real()))) whdr.flags |= RXRPC_REQUEST_ACK; if (IS_ENABLED(CONFIG_AF_RXRPC_INJECT_LOSS)) { static int lose; if ((lose++ & 7) == 7) { ret = 0; trace_rxrpc_tx_data(call, sp->hdr.seq, serial, whdr.flags, retrans, true); goto done; } } trace_rxrpc_tx_data(call, sp->hdr.seq, serial, whdr.flags, retrans, false); /* send the packet with the don't fragment bit set if we currently * think it's small enough */ if (iov[1].iov_len >= call->peer->maxdata) goto send_fragmentable; down_read(&conn->params.local->defrag_sem); sp->hdr.serial = serial; smp_wmb(); /* Set serial before timestamp */ skb->tstamp = ktime_get_real(); if (whdr.flags & RXRPC_REQUEST_ACK) rtt_slot = rxrpc_begin_rtt_probe(call, serial, rxrpc_rtt_tx_data); /* send the packet by UDP * - returns -EMSGSIZE if UDP would have to fragment the packet * to go out of the interface * - in which case, we'll have processed the ICMP error * message and update the peer record */ ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 2, len); conn->params.peer->last_tx_at = ktime_get_seconds(); up_read(&conn->params.local->defrag_sem); if (ret < 0) { rxrpc_cancel_rtt_probe(call, serial, rtt_slot); trace_rxrpc_tx_fail(call->debug_id, serial, ret, rxrpc_tx_point_call_data_nofrag); } else { trace_rxrpc_tx_packet(call->debug_id, &whdr, rxrpc_tx_point_call_data_nofrag); } rxrpc_tx_backoff(call, ret); if (ret == -EMSGSIZE) goto send_fragmentable; done: if (ret >= 0) { if (whdr.flags & RXRPC_REQUEST_ACK) { call->peer->rtt_last_req = skb->tstamp; if (call->peer->rtt_count > 1) { unsigned long nowj = jiffies, ack_lost_at; ack_lost_at = rxrpc_get_rto_backoff(call->peer, false); ack_lost_at += nowj; WRITE_ONCE(call->ack_lost_at, ack_lost_at); rxrpc_reduce_call_timer(call, ack_lost_at, nowj, rxrpc_timer_set_for_lost_ack); } } if (sp->hdr.seq == 1 && !test_and_set_bit(RXRPC_CALL_BEGAN_RX_TIMER, &call->flags)) { unsigned long nowj = jiffies, expect_rx_by; expect_rx_by = nowj + call->next_rx_timo; WRITE_ONCE(call->expect_rx_by, expect_rx_by); rxrpc_reduce_call_timer(call, expect_rx_by, nowj, rxrpc_timer_set_for_normal); } rxrpc_set_keepalive(call); } else { /* Cancel the call if the initial transmission fails, * particularly if that's due to network routing issues that * aren't going away anytime soon. The layer above can arrange * the retransmission. 
*/ if (!test_and_set_bit(RXRPC_CALL_BEGAN_RX_TIMER, &call->flags)) rxrpc_set_call_completion(call, RXRPC_CALL_LOCAL_ERROR, RX_USER_ABORT, ret); } _leave(" = %d [%u]", ret, call->peer->maxdata); return ret; send_fragmentable: /* attempt to send this message with fragmentation enabled */ _debug("send fragment"); down_write(&conn->params.local->defrag_sem); sp->hdr.serial = serial; smp_wmb(); /* Set serial before timestamp */ skb->tstamp = ktime_get_real(); if (whdr.flags & RXRPC_REQUEST_ACK) rtt_slot = rxrpc_begin_rtt_probe(call, serial, rxrpc_rtt_tx_data); switch (conn->params.local->srx.transport.family) { case AF_INET6: case AF_INET: ip_sock_set_mtu_discover(conn->params.local->socket->sk, IP_PMTUDISC_DONT); ret = kernel_sendmsg(conn->params.local->socket, &msg, iov, 2, len); conn->params.peer->last_tx_at = ktime_get_seconds(); ip_sock_set_mtu_discover(conn->params.local->socket->sk, IP_PMTUDISC_DO); break; default: BUG(); } if (ret < 0) { rxrpc_cancel_rtt_probe(call, serial, rtt_slot); trace_rxrpc_tx_fail(call->debug_id, serial, ret, rxrpc_tx_point_call_data_frag); } else { trace_rxrpc_tx_packet(call->debug_id, &whdr, rxrpc_tx_point_call_data_frag); } rxrpc_tx_backoff(call, ret); up_write(&conn->params.local->defrag_sem); goto done; } /* * reject packets through the local endpoint */ void rxrpc_reject_packets(struct rxrpc_local *local) { struct sockaddr_rxrpc srx; struct rxrpc_skb_priv *sp; struct rxrpc_wire_header whdr; struct sk_buff *skb; struct msghdr msg; struct kvec iov[2]; size_t size; __be32 code; int ret, ioc; _enter("%d", local->debug_id); iov[0].iov_base = &whdr; iov[0].iov_len = sizeof(whdr); iov[1].iov_base = &code; iov[1].iov_len = sizeof(code); msg.msg_name = &srx.transport; msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_flags = 0; memset(&whdr, 0, sizeof(whdr)); while ((skb = skb_dequeue(&local->reject_queue))) { rxrpc_see_skb(skb, rxrpc_skb_seen); sp = rxrpc_skb(skb); switch (skb->mark) { case RXRPC_SKB_MARK_REJECT_BUSY: whdr.type = RXRPC_PACKET_TYPE_BUSY; size = sizeof(whdr); ioc = 1; break; case RXRPC_SKB_MARK_REJECT_ABORT: whdr.type = RXRPC_PACKET_TYPE_ABORT; code = htonl(skb->priority); size = sizeof(whdr) + sizeof(code); ioc = 2; break; default: rxrpc_free_skb(skb, rxrpc_skb_freed); continue; } if (rxrpc_extract_addr_from_skb(&srx, skb) == 0) { msg.msg_namelen = srx.transport_len; whdr.epoch = htonl(sp->hdr.epoch); whdr.cid = htonl(sp->hdr.cid); whdr.callNumber = htonl(sp->hdr.callNumber); whdr.serviceId = htons(sp->hdr.serviceId); whdr.flags = sp->hdr.flags; whdr.flags ^= RXRPC_CLIENT_INITIATED; whdr.flags &= RXRPC_CLIENT_INITIATED; ret = kernel_sendmsg(local->socket, &msg, iov, ioc, size); if (ret < 0) trace_rxrpc_tx_fail(local->debug_id, 0, ret, rxrpc_tx_point_reject); else trace_rxrpc_tx_packet(local->debug_id, &whdr, rxrpc_tx_point_reject); } rxrpc_free_skb(skb, rxrpc_skb_freed); } _leave(""); } /* * Send a VERSION reply to a peer as a keepalive. 
*/ void rxrpc_send_keepalive(struct rxrpc_peer *peer) { struct rxrpc_wire_header whdr; struct msghdr msg; struct kvec iov[2]; size_t len; int ret; _enter(""); msg.msg_name = &peer->srx.transport; msg.msg_namelen = peer->srx.transport_len; msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_flags = 0; whdr.epoch = htonl(peer->local->rxnet->epoch); whdr.cid = 0; whdr.callNumber = 0; whdr.seq = 0; whdr.serial = 0; whdr.type = RXRPC_PACKET_TYPE_VERSION; /* Not client-initiated */ whdr.flags = RXRPC_LAST_PACKET; whdr.userStatus = 0; whdr.securityIndex = 0; whdr._rsvd = 0; whdr.serviceId = 0; iov[0].iov_base = &whdr; iov[0].iov_len = sizeof(whdr); iov[1].iov_base = (char *)rxrpc_keepalive_string; iov[1].iov_len = sizeof(rxrpc_keepalive_string); len = iov[0].iov_len + iov[1].iov_len; _proto("Tx VERSION (keepalive)"); ret = kernel_sendmsg(peer->local->socket, &msg, iov, 2, len); if (ret < 0) trace_rxrpc_tx_fail(peer->debug_id, 0, ret, rxrpc_tx_point_version_keepalive); else trace_rxrpc_tx_packet(peer->debug_id, &whdr, rxrpc_tx_point_version_keepalive); peer->last_tx_at = ktime_get_seconds(); _leave(""); }
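For reference, rxrpc_fill_out_ack() above builds the soft-ACK table that rxrpc_send_ack_packet() then transmits: one byte per sequence number in the window (hard_ack, top], ACK when the packet is still held in the Rx ring, NACK otherwise, followed by three padding bytes, which is why the returned length is nAcks + 3 and feeds directly into the iov[0] length. The standalone sketch below is only a simplified model of that loop; example_fill_soft_acks(), have_seq() and the EX_ACK/EX_NACK constants are invented names, and real sequence numbers also need the before_eq() wrap-around handling used in the code above.

/* Simplified, userspace-buildable model of the soft-ACK fill loop. */
#include <stdint.h>
#include <stddef.h>

#define EX_ACK  1	/* stands in for RXRPC_ACK_TYPE_ACK */
#define EX_NACK 0	/* stands in for RXRPC_ACK_TYPE_NACK */

static size_t example_fill_soft_acks(uint8_t *acks, uint32_t hard_ack,
				     uint32_t top,
				     int (*have_seq)(uint32_t seq))
{
	uint8_t *p = acks;
	uint32_t seq;

	/* One byte per packet in (hard_ack, top]; ignores wrap-around. */
	for (seq = hard_ack + 1; seq <= top; seq++)
		*p++ = have_seq(seq) ? EX_ACK : EX_NACK;

	/* Three bytes of padding before the trailing ackinfo. */
	*p++ = 0;
	*p++ = 0;
	*p++ = 0;
	return (size_t)(top - hard_ack) + 3;
}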
// SPDX-License-Identifier: GPL-2.0-or-later /* * net-sysfs.c - network device class and attributes * * Copyright (c) 2003 Stephen Hemminger <shemminger@osdl.org> */ #include <linux/capability.h> #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/slab.h> #include <linux/sched/signal.h> #include <linux/sched/isolation.h> #include <linux/nsproxy.h> #include <net/sock.h> #include <net/net_namespace.h> #include <linux/rtnetlink.h> #include <linux/vmalloc.h> #include <linux/export.h> #include <linux/jiffies.h> #include <linux/pm_runtime.h> #include <linux/of.h> #include <linux/of_net.h> #include <linux/cpu.h> #include "net-sysfs.h" #ifdef CONFIG_SYSFS static const char fmt_hex[] = "%#x\n"; static const char fmt_dec[] = "%d\n"; static const char fmt_ulong[] = "%lu\n"; static const char fmt_u64[] = "%llu\n"; /* Caller holds RTNL or dev_base_lock */ static inline int dev_isalive(const struct net_device *dev) { return dev->reg_state <= NETREG_REGISTERED; } /* use same locking rules as GIF* ioctl's */ static ssize_t netdev_show(const struct device *dev, struct device_attribute *attr, char *buf, ssize_t (*format)(const struct net_device *, char *)) { struct 
net_device *ndev = to_net_dev(dev); ssize_t ret = -EINVAL; read_lock(&dev_base_lock); if (dev_isalive(ndev)) ret = (*format)(ndev, buf); read_unlock(&dev_base_lock); return ret; } /* generate a show function for simple field */ #define NETDEVICE_SHOW(field, format_string) \ static ssize_t format_##field(const struct net_device *dev, char *buf) \ { \ return sprintf(buf, format_string, dev->field); \ } \ static ssize_t field##_show(struct device *dev, \ struct device_attribute *attr, char *buf) \ { \ return netdev_show(dev, attr, buf, format_##field); \ } \ #define NETDEVICE_SHOW_RO(field, format_string) \ NETDEVICE_SHOW(field, format_string); \ static DEVICE_ATTR_RO(field) #define NETDEVICE_SHOW_RW(field, format_string) \ NETDEVICE_SHOW(field, format_string); \ static DEVICE_ATTR_RW(field) /* use same locking and permission rules as SIF* ioctl's */ static ssize_t netdev_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len, int (*set)(struct net_device *, unsigned long)) { struct net_device *netdev = to_net_dev(dev); struct net *net = dev_net(netdev); unsigned long new; int ret; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; ret = kstrtoul(buf, 0, &new); if (ret) goto err; if (!rtnl_trylock()) return restart_syscall(); if (dev_isalive(netdev)) { ret = (*set)(netdev, new); if (ret == 0) ret = len; } rtnl_unlock(); err: return ret; } NETDEVICE_SHOW_RO(dev_id, fmt_hex); NETDEVICE_SHOW_RO(dev_port, fmt_dec); NETDEVICE_SHOW_RO(addr_assign_type, fmt_dec); NETDEVICE_SHOW_RO(addr_len, fmt_dec); NETDEVICE_SHOW_RO(ifindex, fmt_dec); NETDEVICE_SHOW_RO(type, fmt_dec); NETDEVICE_SHOW_RO(link_mode, fmt_dec); static ssize_t iflink_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *ndev = to_net_dev(dev); return sprintf(buf, fmt_dec, dev_get_iflink(ndev)); } static DEVICE_ATTR_RO(iflink); static ssize_t format_name_assign_type(const struct net_device *dev, char *buf) { return sprintf(buf, fmt_dec, dev->name_assign_type); } static ssize_t name_assign_type_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *ndev = to_net_dev(dev); ssize_t ret = -EINVAL; if (ndev->name_assign_type != NET_NAME_UNKNOWN) ret = netdev_show(dev, attr, buf, format_name_assign_type); return ret; } static DEVICE_ATTR_RO(name_assign_type); /* use same locking rules as GIFHWADDR ioctl's */ static ssize_t address_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *ndev = to_net_dev(dev); ssize_t ret = -EINVAL; read_lock(&dev_base_lock); if (dev_isalive(ndev)) ret = sysfs_format_mac(buf, ndev->dev_addr, ndev->addr_len); read_unlock(&dev_base_lock); return ret; } static DEVICE_ATTR_RO(address); static ssize_t broadcast_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *ndev = to_net_dev(dev); if (dev_isalive(ndev)) return sysfs_format_mac(buf, ndev->broadcast, ndev->addr_len); return -EINVAL; } static DEVICE_ATTR_RO(broadcast); static int change_carrier(struct net_device *dev, unsigned long new_carrier) { if (!netif_running(dev)) return -EINVAL; return dev_change_carrier(dev, (bool)new_carrier); } static ssize_t carrier_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { struct net_device *netdev = to_net_dev(dev); /* The check is also done in change_carrier; this helps returning early * without hitting the trylock/restart in netdev_store. 
*/ if (!netdev->netdev_ops->ndo_change_carrier) return -EOPNOTSUPP; return netdev_store(dev, attr, buf, len, change_carrier); } static ssize_t carrier_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *netdev = to_net_dev(dev); if (netif_running(netdev)) return sprintf(buf, fmt_dec, !!netif_carrier_ok(netdev)); return -EINVAL; } static DEVICE_ATTR_RW(carrier); static ssize_t speed_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *netdev = to_net_dev(dev); int ret = -EINVAL; /* The check is also done in __ethtool_get_link_ksettings; this helps * returning early without hitting the trylock/restart below. */ if (!netdev->ethtool_ops->get_link_ksettings) return ret; if (!rtnl_trylock()) return restart_syscall(); if (netif_running(netdev)) { struct ethtool_link_ksettings cmd; if (!__ethtool_get_link_ksettings(netdev, &cmd)) ret = sprintf(buf, fmt_dec, cmd.base.speed); } rtnl_unlock(); return ret; } static DEVICE_ATTR_RO(speed); static ssize_t duplex_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *netdev = to_net_dev(dev); int ret = -EINVAL; /* The check is also done in __ethtool_get_link_ksettings; this helps * returning early without hitting the trylock/restart below. */ if (!netdev->ethtool_ops->get_link_ksettings) return ret; if (!rtnl_trylock()) return restart_syscall(); if (netif_running(netdev)) { struct ethtool_link_ksettings cmd; if (!__ethtool_get_link_ksettings(netdev, &cmd)) { const char *duplex; switch (cmd.base.duplex) { case DUPLEX_HALF: duplex = "half"; break; case DUPLEX_FULL: duplex = "full"; break; default: duplex = "unknown"; break; } ret = sprintf(buf, "%s\n", duplex); } } rtnl_unlock(); return ret; } static DEVICE_ATTR_RO(duplex); static ssize_t testing_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *netdev = to_net_dev(dev); if (netif_running(netdev)) return sprintf(buf, fmt_dec, !!netif_testing(netdev)); return -EINVAL; } static DEVICE_ATTR_RO(testing); static ssize_t dormant_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *netdev = to_net_dev(dev); if (netif_running(netdev)) return sprintf(buf, fmt_dec, !!netif_dormant(netdev)); return -EINVAL; } static DEVICE_ATTR_RO(dormant); static const char *const operstates[] = { "unknown", "notpresent", /* currently unused */ "down", "lowerlayerdown", "testing", "dormant", "up" }; static ssize_t operstate_show(struct device *dev, struct device_attribute *attr, char *buf) { const struct net_device *netdev = to_net_dev(dev); unsigned char operstate; read_lock(&dev_base_lock); operstate = netdev->operstate; if (!netif_running(netdev)) operstate = IF_OPER_DOWN; read_unlock(&dev_base_lock); if (operstate >= ARRAY_SIZE(operstates)) return -EINVAL; /* should not happen */ return sprintf(buf, "%s\n", operstates[operstate]); } static DEVICE_ATTR_RO(operstate); static ssize_t carrier_changes_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *netdev = to_net_dev(dev); return sprintf(buf, fmt_dec, atomic_read(&netdev->carrier_up_count) + atomic_read(&netdev->carrier_down_count)); } static DEVICE_ATTR_RO(carrier_changes); static ssize_t carrier_up_count_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *netdev = to_net_dev(dev); return sprintf(buf, fmt_dec, atomic_read(&netdev->carrier_up_count)); } static DEVICE_ATTR_RO(carrier_up_count); static ssize_t 
carrier_down_count_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *netdev = to_net_dev(dev); return sprintf(buf, fmt_dec, atomic_read(&netdev->carrier_down_count)); } static DEVICE_ATTR_RO(carrier_down_count); /* read-write attributes */ static int change_mtu(struct net_device *dev, unsigned long new_mtu) { return dev_set_mtu(dev, (int)new_mtu); } static ssize_t mtu_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { return netdev_store(dev, attr, buf, len, change_mtu); } NETDEVICE_SHOW_RW(mtu, fmt_dec); static int change_flags(struct net_device *dev, unsigned long new_flags) { return dev_change_flags(dev, (unsigned int)new_flags, NULL); } static ssize_t flags_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { return netdev_store(dev, attr, buf, len, change_flags); } NETDEVICE_SHOW_RW(flags, fmt_hex); static ssize_t tx_queue_len_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { if (!capable(CAP_NET_ADMIN)) return -EPERM; return netdev_store(dev, attr, buf, len, dev_change_tx_queue_len); } NETDEVICE_SHOW_RW(tx_queue_len, fmt_dec); static int change_gro_flush_timeout(struct net_device *dev, unsigned long val) { WRITE_ONCE(dev->gro_flush_timeout, val); return 0; } static ssize_t gro_flush_timeout_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { if (!capable(CAP_NET_ADMIN)) return -EPERM; return netdev_store(dev, attr, buf, len, change_gro_flush_timeout); } NETDEVICE_SHOW_RW(gro_flush_timeout, fmt_ulong); static int change_napi_defer_hard_irqs(struct net_device *dev, unsigned long val) { WRITE_ONCE(dev->napi_defer_hard_irqs, val); return 0; } static ssize_t napi_defer_hard_irqs_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { if (!capable(CAP_NET_ADMIN)) return -EPERM; return netdev_store(dev, attr, buf, len, change_napi_defer_hard_irqs); } NETDEVICE_SHOW_RW(napi_defer_hard_irqs, fmt_dec); static ssize_t ifalias_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { struct net_device *netdev = to_net_dev(dev); struct net *net = dev_net(netdev); size_t count = len; ssize_t ret = 0; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; /* ignore trailing newline */ if (len > 0 && buf[len - 1] == '\n') --count; if (!rtnl_trylock()) return restart_syscall(); if (dev_isalive(netdev)) { ret = dev_set_alias(netdev, buf, count); if (ret < 0) goto err; ret = len; netdev_state_change(netdev); } err: rtnl_unlock(); return ret; } static ssize_t ifalias_show(struct device *dev, struct device_attribute *attr, char *buf) { const struct net_device *netdev = to_net_dev(dev); char tmp[IFALIASZ]; ssize_t ret = 0; ret = dev_get_alias(netdev, tmp, sizeof(tmp)); if (ret > 0) ret = sprintf(buf, "%s\n", tmp); return ret; } static DEVICE_ATTR_RW(ifalias); static int change_group(struct net_device *dev, unsigned long new_group) { dev_set_group(dev, (int)new_group); return 0; } static ssize_t group_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { return netdev_store(dev, attr, buf, len, change_group); } NETDEVICE_SHOW(group, fmt_dec); static DEVICE_ATTR(netdev_group, 0644, group_show, group_store); static int change_proto_down(struct net_device *dev, unsigned long proto_down) { return dev_change_proto_down(dev, (bool)proto_down); } static ssize_t proto_down_store(struct device *dev, struct device_attribute *attr, 
const char *buf, size_t len) { struct net_device *netdev = to_net_dev(dev); /* The check is also done in change_proto_down; this helps returning * early without hitting the trylock/restart in netdev_store. */ if (!netdev->netdev_ops->ndo_change_proto_down) return -EOPNOTSUPP; return netdev_store(dev, attr, buf, len, change_proto_down); } NETDEVICE_SHOW_RW(proto_down, fmt_dec); static ssize_t phys_port_id_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *netdev = to_net_dev(dev); ssize_t ret = -EINVAL; /* The check is also done in dev_get_phys_port_id; this helps returning * early without hitting the trylock/restart below. */ if (!netdev->netdev_ops->ndo_get_phys_port_id) return -EOPNOTSUPP; if (!rtnl_trylock()) return restart_syscall(); if (dev_isalive(netdev)) { struct netdev_phys_item_id ppid; ret = dev_get_phys_port_id(netdev, &ppid); if (!ret) ret = sprintf(buf, "%*phN\n", ppid.id_len, ppid.id); } rtnl_unlock(); return ret; } static DEVICE_ATTR_RO(phys_port_id); static ssize_t phys_port_name_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *netdev = to_net_dev(dev); ssize_t ret = -EINVAL; /* The checks are also done in dev_get_phys_port_name; this helps * returning early without hitting the trylock/restart below. */ if (!netdev->netdev_ops->ndo_get_phys_port_name && !netdev->netdev_ops->ndo_get_devlink_port) return -EOPNOTSUPP; if (!rtnl_trylock()) return restart_syscall(); if (dev_isalive(netdev)) { char name[IFNAMSIZ]; ret = dev_get_phys_port_name(netdev, name, sizeof(name)); if (!ret) ret = sprintf(buf, "%s\n", name); } rtnl_unlock(); return ret; } static DEVICE_ATTR_RO(phys_port_name); static ssize_t phys_switch_id_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *netdev = to_net_dev(dev); ssize_t ret = -EINVAL; /* The checks are also done in dev_get_phys_port_name; this helps * returning early without hitting the trylock/restart below. This works * because recurse is false when calling dev_get_port_parent_id. 
*/ if (!netdev->netdev_ops->ndo_get_port_parent_id && !netdev->netdev_ops->ndo_get_devlink_port) return -EOPNOTSUPP; if (!rtnl_trylock()) return restart_syscall(); if (dev_isalive(netdev)) { struct netdev_phys_item_id ppid = { }; ret = dev_get_port_parent_id(netdev, &ppid, false); if (!ret) ret = sprintf(buf, "%*phN\n", ppid.id_len, ppid.id); } rtnl_unlock(); return ret; } static DEVICE_ATTR_RO(phys_switch_id); static ssize_t threaded_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *netdev = to_net_dev(dev); ssize_t ret = -EINVAL; if (!rtnl_trylock()) return restart_syscall(); if (dev_isalive(netdev)) ret = sprintf(buf, fmt_dec, netdev->threaded); rtnl_unlock(); return ret; } static int modify_napi_threaded(struct net_device *dev, unsigned long val) { int ret; if (list_empty(&dev->napi_list)) return -EOPNOTSUPP; if (val != 0 && val != 1) return -EOPNOTSUPP; ret = dev_set_threaded(dev, val); return ret; } static ssize_t threaded_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { return netdev_store(dev, attr, buf, len, modify_napi_threaded); } static DEVICE_ATTR_RW(threaded); static struct attribute *net_class_attrs[] __ro_after_init = { &dev_attr_netdev_group.attr, &dev_attr_type.attr, &dev_attr_dev_id.attr, &dev_attr_dev_port.attr, &dev_attr_iflink.attr, &dev_attr_ifindex.attr, &dev_attr_name_assign_type.attr, &dev_attr_addr_assign_type.attr, &dev_attr_addr_len.attr, &dev_attr_link_mode.attr, &dev_attr_address.attr, &dev_attr_broadcast.attr, &dev_attr_speed.attr, &dev_attr_duplex.attr, &dev_attr_dormant.attr, &dev_attr_testing.attr, &dev_attr_operstate.attr, &dev_attr_carrier_changes.attr, &dev_attr_ifalias.attr, &dev_attr_carrier.attr, &dev_attr_mtu.attr, &dev_attr_flags.attr, &dev_attr_tx_queue_len.attr, &dev_attr_gro_flush_timeout.attr, &dev_attr_napi_defer_hard_irqs.attr, &dev_attr_phys_port_id.attr, &dev_attr_phys_port_name.attr, &dev_attr_phys_switch_id.attr, &dev_attr_proto_down.attr, &dev_attr_carrier_up_count.attr, &dev_attr_carrier_down_count.attr, &dev_attr_threaded.attr, NULL, }; ATTRIBUTE_GROUPS(net_class); /* Show a given an attribute in the statistics group */ static ssize_t netstat_show(const struct device *d, struct device_attribute *attr, char *buf, unsigned long offset) { struct net_device *dev = to_net_dev(d); ssize_t ret = -EINVAL; WARN_ON(offset > sizeof(struct rtnl_link_stats64) || offset % sizeof(u64) != 0); read_lock(&dev_base_lock); if (dev_isalive(dev)) { struct rtnl_link_stats64 temp; const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp); ret = sprintf(buf, fmt_u64, *(u64 *)(((u8 *)stats) + offset)); } read_unlock(&dev_base_lock); return ret; } /* generate a read-only statistics attribute */ #define NETSTAT_ENTRY(name) \ static ssize_t name##_show(struct device *d, \ struct device_attribute *attr, char *buf) \ { \ return netstat_show(d, attr, buf, \ offsetof(struct rtnl_link_stats64, name)); \ } \ static DEVICE_ATTR_RO(name) NETSTAT_ENTRY(rx_packets); NETSTAT_ENTRY(tx_packets); NETSTAT_ENTRY(rx_bytes); NETSTAT_ENTRY(tx_bytes); NETSTAT_ENTRY(rx_errors); NETSTAT_ENTRY(tx_errors); NETSTAT_ENTRY(rx_dropped); NETSTAT_ENTRY(tx_dropped); NETSTAT_ENTRY(multicast); NETSTAT_ENTRY(collisions); NETSTAT_ENTRY(rx_length_errors); NETSTAT_ENTRY(rx_over_errors); NETSTAT_ENTRY(rx_crc_errors); NETSTAT_ENTRY(rx_frame_errors); NETSTAT_ENTRY(rx_fifo_errors); NETSTAT_ENTRY(rx_missed_errors); NETSTAT_ENTRY(tx_aborted_errors); NETSTAT_ENTRY(tx_carrier_errors); NETSTAT_ENTRY(tx_fifo_errors); 
NETSTAT_ENTRY(tx_heartbeat_errors); NETSTAT_ENTRY(tx_window_errors); NETSTAT_ENTRY(rx_compressed); NETSTAT_ENTRY(tx_compressed); NETSTAT_ENTRY(rx_nohandler); static struct attribute *netstat_attrs[] __ro_after_init = { &dev_attr_rx_packets.attr, &dev_attr_tx_packets.attr, &dev_attr_rx_bytes.attr, &dev_attr_tx_bytes.attr, &dev_attr_rx_errors.attr, &dev_attr_tx_errors.attr, &dev_attr_rx_dropped.attr, &dev_attr_tx_dropped.attr, &dev_attr_multicast.attr, &dev_attr_collisions.attr, &dev_attr_rx_length_errors.attr, &dev_attr_rx_over_errors.attr, &dev_attr_rx_crc_errors.attr, &dev_attr_rx_frame_errors.attr, &dev_attr_rx_fifo_errors.attr, &dev_attr_rx_missed_errors.attr, &dev_attr_tx_aborted_errors.attr, &dev_attr_tx_carrier_errors.attr, &dev_attr_tx_fifo_errors.attr, &dev_attr_tx_heartbeat_errors.attr, &dev_attr_tx_window_errors.attr, &dev_attr_rx_compressed.attr, &dev_attr_tx_compressed.attr, &dev_attr_rx_nohandler.attr, NULL }; static const struct attribute_group netstat_group = { .name = "statistics", .attrs = netstat_attrs, }; #if IS_ENABLED(CONFIG_WIRELESS_EXT) || IS_ENABLED(CONFIG_CFG80211) static struct attribute *wireless_attrs[] = { NULL }; static const struct attribute_group wireless_group = { .name = "wireless", .attrs = wireless_attrs, }; #endif #else /* CONFIG_SYSFS */ #define net_class_groups NULL #endif /* CONFIG_SYSFS */ #ifdef CONFIG_SYSFS #define to_rx_queue_attr(_attr) \ container_of(_attr, struct rx_queue_attribute, attr) #define to_rx_queue(obj) container_of(obj, struct netdev_rx_queue, kobj) static ssize_t rx_queue_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) { const struct rx_queue_attribute *attribute = to_rx_queue_attr(attr); struct netdev_rx_queue *queue = to_rx_queue(kobj); if (!attribute->show) return -EIO; return attribute->show(queue, buf); } static ssize_t rx_queue_attr_store(struct kobject *kobj, struct attribute *attr, const char *buf, size_t count) { const struct rx_queue_attribute *attribute = to_rx_queue_attr(attr); struct netdev_rx_queue *queue = to_rx_queue(kobj); if (!attribute->store) return -EIO; return attribute->store(queue, buf, count); } static const struct sysfs_ops rx_queue_sysfs_ops = { .show = rx_queue_attr_show, .store = rx_queue_attr_store, }; #ifdef CONFIG_RPS static ssize_t show_rps_map(struct netdev_rx_queue *queue, char *buf) { struct rps_map *map; cpumask_var_t mask; int i, len; if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) return -ENOMEM; rcu_read_lock(); map = rcu_dereference(queue->rps_map); if (map) for (i = 0; i < map->len; i++) cpumask_set_cpu(map->cpus[i], mask); len = snprintf(buf, PAGE_SIZE, "%*pb\n", cpumask_pr_args(mask)); rcu_read_unlock(); free_cpumask_var(mask); return len < PAGE_SIZE ? 
len : -EINVAL; } static ssize_t store_rps_map(struct netdev_rx_queue *queue, const char *buf, size_t len) { struct rps_map *old_map, *map; cpumask_var_t mask; int err, cpu, i, hk_flags; static DEFINE_MUTEX(rps_map_mutex); if (!capable(CAP_NET_ADMIN)) return -EPERM; if (!alloc_cpumask_var(&mask, GFP_KERNEL)) return -ENOMEM; err = bitmap_parse(buf, len, cpumask_bits(mask), nr_cpumask_bits); if (err) { free_cpumask_var(mask); return err; } if (!cpumask_empty(mask)) { hk_flags = HK_FLAG_DOMAIN | HK_FLAG_WQ; cpumask_and(mask, mask, housekeeping_cpumask(hk_flags)); if (cpumask_empty(mask)) { free_cpumask_var(mask); return -EINVAL; } } map = kzalloc(max_t(unsigned int, RPS_MAP_SIZE(cpumask_weight(mask)), L1_CACHE_BYTES), GFP_KERNEL); if (!map) { free_cpumask_var(mask); return -ENOMEM; } i = 0; for_each_cpu_and(cpu, mask, cpu_online_mask) map->cpus[i++] = cpu; if (i) { map->len = i; } else { kfree(map); map = NULL; } mutex_lock(&rps_map_mutex); old_map = rcu_dereference_protected(queue->rps_map, mutex_is_locked(&rps_map_mutex)); rcu_assign_pointer(queue->rps_map, map); if (map) static_branch_inc(&rps_needed); if (old_map) static_branch_dec(&rps_needed); mutex_unlock(&rps_map_mutex); if (old_map) kfree_rcu(old_map, rcu); free_cpumask_var(mask); return len; } static ssize_t show_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, char *buf) { struct rps_dev_flow_table *flow_table; unsigned long val = 0; rcu_read_lock(); flow_table = rcu_dereference(queue->rps_flow_table); if (flow_table) val = (unsigned long)flow_table->mask + 1; rcu_read_unlock(); return sprintf(buf, "%lu\n", val); } static void rps_dev_flow_table_release(struct rcu_head *rcu) { struct rps_dev_flow_table *table = container_of(rcu, struct rps_dev_flow_table, rcu); vfree(table); } static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, const char *buf, size_t len) { unsigned long mask, count; struct rps_dev_flow_table *table, *old_table; static DEFINE_SPINLOCK(rps_dev_flow_lock); int rc; if (!capable(CAP_NET_ADMIN)) return -EPERM; rc = kstrtoul(buf, 0, &count); if (rc < 0) return rc; if (count) { mask = count - 1; /* mask = roundup_pow_of_two(count) - 1; * without overflows... */ while ((mask | (mask >> 1)) != mask) mask |= (mask >> 1); /* On 64 bit arches, must check mask fits in table->mask (u32), * and on 32bit arches, must check * RPS_DEV_FLOW_TABLE_SIZE(mask + 1) doesn't overflow. 
*/ #if BITS_PER_LONG > 32 if (mask > (unsigned long)(u32)mask) return -EINVAL; #else if (mask > (ULONG_MAX - RPS_DEV_FLOW_TABLE_SIZE(1)) / sizeof(struct rps_dev_flow)) { /* Enforce a limit to prevent overflow */ return -EINVAL; } #endif table = vmalloc(RPS_DEV_FLOW_TABLE_SIZE(mask + 1)); if (!table) return -ENOMEM; table->mask = mask; for (count = 0; count <= mask; count++) table->flows[count].cpu = RPS_NO_CPU; } else { table = NULL; } spin_lock(&rps_dev_flow_lock); old_table = rcu_dereference_protected(queue->rps_flow_table, lockdep_is_held(&rps_dev_flow_lock)); rcu_assign_pointer(queue->rps_flow_table, table); spin_unlock(&rps_dev_flow_lock); if (old_table) call_rcu(&old_table->rcu, rps_dev_flow_table_release); return len; } static struct rx_queue_attribute rps_cpus_attribute __ro_after_init = __ATTR(rps_cpus, 0644, show_rps_map, store_rps_map); static struct rx_queue_attribute rps_dev_flow_table_cnt_attribute __ro_after_init = __ATTR(rps_flow_cnt, 0644, show_rps_dev_flow_table_cnt, store_rps_dev_flow_table_cnt); #endif /* CONFIG_RPS */ static struct attribute *rx_queue_default_attrs[] __ro_after_init = { #ifdef CONFIG_RPS &rps_cpus_attribute.attr, &rps_dev_flow_table_cnt_attribute.attr, #endif NULL }; ATTRIBUTE_GROUPS(rx_queue_default); static void rx_queue_release(struct kobject *kobj) { struct netdev_rx_queue *queue = to_rx_queue(kobj); #ifdef CONFIG_RPS struct rps_map *map; struct rps_dev_flow_table *flow_table; map = rcu_dereference_protected(queue->rps_map, 1); if (map) { RCU_INIT_POINTER(queue->rps_map, NULL); kfree_rcu(map, rcu); } flow_table = rcu_dereference_protected(queue->rps_flow_table, 1); if (flow_table) { RCU_INIT_POINTER(queue->rps_flow_table, NULL); call_rcu(&flow_table->rcu, rps_dev_flow_table_release); } #endif memset(kobj, 0, sizeof(*kobj)); dev_put(queue->dev); } static const void *rx_queue_namespace(struct kobject *kobj) { struct netdev_rx_queue *queue = to_rx_queue(kobj); struct device *dev = &queue->dev->dev; const void *ns = NULL; if (dev->class && dev->class->ns_type) ns = dev->class->namespace(dev); return ns; } static void rx_queue_get_ownership(struct kobject *kobj, kuid_t *uid, kgid_t *gid) { const struct net *net = rx_queue_namespace(kobj); net_ns_get_ownership(net, uid, gid); } static struct kobj_type rx_queue_ktype __ro_after_init = { .sysfs_ops = &rx_queue_sysfs_ops, .release = rx_queue_release, .default_groups = rx_queue_default_groups, .namespace = rx_queue_namespace, .get_ownership = rx_queue_get_ownership, }; static int rx_queue_add_kobject(struct net_device *dev, int index) { struct netdev_rx_queue *queue = dev->_rx + index; struct kobject *kobj = &queue->kobj; int error = 0; /* Kobject_put later will trigger rx_queue_release call which * decreases dev refcount: Take that reference here */ dev_hold(queue->dev); kobj->kset = dev->queues_kset; error = kobject_init_and_add(kobj, &rx_queue_ktype, NULL, "rx-%u", index); if (error) goto err; if (dev->sysfs_rx_queue_group) { error = sysfs_create_group(kobj, dev->sysfs_rx_queue_group); if (error) goto err; } kobject_uevent(kobj, KOBJ_ADD); return error; err: kobject_put(kobj); return error; } static int rx_queue_change_owner(struct net_device *dev, int index, kuid_t kuid, kgid_t kgid) { struct netdev_rx_queue *queue = dev->_rx + index; struct kobject *kobj = &queue->kobj; int error; error = sysfs_change_owner(kobj, kuid, kgid); if (error) return error; if (dev->sysfs_rx_queue_group) error = sysfs_group_change_owner( kobj, dev->sysfs_rx_queue_group, kuid, kgid); return error; } #endif /* CONFIG_SYSFS */ 
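/*
 * A minimal sketch (not part of the original file, names invented for
 * illustration): the rx_queue_attribute plumbing above is all that is
 * needed to expose another per-queue file under
 * /sys/class/net/<dev>/queues/rx-<n>/.  A read-only "example" attribute
 * would look like the following; a real one would also have to be listed
 * in rx_queue_default_attrs[] so the kobject type picks it up.
 */
#if 0	/* illustrative only */
static ssize_t example_show(struct netdev_rx_queue *queue, char *buf)
{
	/* rx_queue_attr_show() routes reads of the "example" file here */
	return sprintf(buf, "%u\n", queue->dev->real_num_rx_queues);
}

static struct rx_queue_attribute example_attribute __ro_after_init =
	__ATTR_RO(example);
#endif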
int net_rx_queue_update_kobjects(struct net_device *dev, int old_num, int new_num) { #ifdef CONFIG_SYSFS int i; int error = 0; #ifndef CONFIG_RPS if (!dev->sysfs_rx_queue_group) return 0; #endif for (i = old_num; i < new_num; i++) { error = rx_queue_add_kobject(dev, i); if (error) { new_num = old_num; break; } } while (--i >= new_num) { struct kobject *kobj = &dev->_rx[i].kobj; if (!refcount_read(&dev_net(dev)->ns.count)) kobj->uevent_suppress = 1; if (dev->sysfs_rx_queue_group) sysfs_remove_group(kobj, dev->sysfs_rx_queue_group); kobject_put(kobj); } return error; #else return 0; #endif } static int net_rx_queue_change_owner(struct net_device *dev, int num, kuid_t kuid, kgid_t kgid) { #ifdef CONFIG_SYSFS int error = 0; int i; #ifndef CONFIG_RPS if (!dev->sysfs_rx_queue_group) return 0; #endif for (i = 0; i < num; i++) { error = rx_queue_change_owner(dev, i, kuid, kgid); if (error) break; } return error; #else return 0; #endif } #ifdef CONFIG_SYSFS /* * netdev_queue sysfs structures and functions. */ struct netdev_queue_attribute { struct attribute attr; ssize_t (*show)(struct netdev_queue *queue, char *buf); ssize_t (*store)(struct netdev_queue *queue, const char *buf, size_t len); }; #define to_netdev_queue_attr(_attr) \ container_of(_attr, struct netdev_queue_attribute, attr) #define to_netdev_queue(obj) container_of(obj, struct netdev_queue, kobj) static ssize_t netdev_queue_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) { const struct netdev_queue_attribute *attribute = to_netdev_queue_attr(attr); struct netdev_queue *queue = to_netdev_queue(kobj); if (!attribute->show) return -EIO; return attribute->show(queue, buf); } static ssize_t netdev_queue_attr_store(struct kobject *kobj, struct attribute *attr, const char *buf, size_t count) { const struct netdev_queue_attribute *attribute = to_netdev_queue_attr(attr); struct netdev_queue *queue = to_netdev_queue(kobj); if (!attribute->store) return -EIO; return attribute->store(queue, buf, count); } static const struct sysfs_ops netdev_queue_sysfs_ops = { .show = netdev_queue_attr_show, .store = netdev_queue_attr_store, }; static ssize_t tx_timeout_show(struct netdev_queue *queue, char *buf) { unsigned long trans_timeout; spin_lock_irq(&queue->_xmit_lock); trans_timeout = queue->trans_timeout; spin_unlock_irq(&queue->_xmit_lock); return sprintf(buf, fmt_ulong, trans_timeout); } static unsigned int get_netdev_queue_index(struct netdev_queue *queue) { struct net_device *dev = queue->dev; unsigned int i; i = queue - dev->_tx; BUG_ON(i >= dev->num_tx_queues); return i; } static ssize_t traffic_class_show(struct netdev_queue *queue, char *buf) { struct net_device *dev = queue->dev; int num_tc, tc; int index; if (!netif_is_multiqueue(dev)) return -ENOENT; if (!rtnl_trylock()) return restart_syscall(); index = get_netdev_queue_index(queue); /* If queue belongs to subordinate dev use its TC mapping */ dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev; num_tc = dev->num_tc; tc = netdev_txq_to_tc(dev, index); rtnl_unlock(); if (tc < 0) return -EINVAL; /* We can report the traffic class one of two ways: * Subordinate device traffic classes are reported with the traffic * class first, and then the subordinate class so for example TC0 on * subordinate device 2 will be reported as "0-2". If the queue * belongs to the root device it will be reported with just the * traffic class, so just "0" for TC 0 for example. */ return num_tc < 0 ? 
sprintf(buf, "%d%d\n", tc, num_tc) : sprintf(buf, "%d\n", tc); } #ifdef CONFIG_XPS static ssize_t tx_maxrate_show(struct netdev_queue *queue, char *buf) { return sprintf(buf, "%lu\n", queue->tx_maxrate); } static ssize_t tx_maxrate_store(struct netdev_queue *queue, const char *buf, size_t len) { struct net_device *dev = queue->dev; int err, index = get_netdev_queue_index(queue); u32 rate = 0; if (!capable(CAP_NET_ADMIN)) return -EPERM; /* The check is also done later; this helps returning early without * hitting the trylock/restart below. */ if (!dev->netdev_ops->ndo_set_tx_maxrate) return -EOPNOTSUPP; err = kstrtou32(buf, 10, &rate); if (err < 0) return err; if (!rtnl_trylock()) return restart_syscall(); err = -EOPNOTSUPP; if (dev->netdev_ops->ndo_set_tx_maxrate) err = dev->netdev_ops->ndo_set_tx_maxrate(dev, index, rate); rtnl_unlock(); if (!err) { queue->tx_maxrate = rate; return len; } return err; } static struct netdev_queue_attribute queue_tx_maxrate __ro_after_init = __ATTR_RW(tx_maxrate); #endif static struct netdev_queue_attribute queue_trans_timeout __ro_after_init = __ATTR_RO(tx_timeout); static struct netdev_queue_attribute queue_traffic_class __ro_after_init = __ATTR_RO(traffic_class); #ifdef CONFIG_BQL /* * Byte queue limits sysfs structures and functions. */ static ssize_t bql_show(char *buf, unsigned int value) { return sprintf(buf, "%u\n", value); } static ssize_t bql_set(const char *buf, const size_t count, unsigned int *pvalue) { unsigned int value; int err; if (!strcmp(buf, "max") || !strcmp(buf, "max\n")) { value = DQL_MAX_LIMIT; } else { err = kstrtouint(buf, 10, &value); if (err < 0) return err; if (value > DQL_MAX_LIMIT) return -EINVAL; } *pvalue = value; return count; } static ssize_t bql_show_hold_time(struct netdev_queue *queue, char *buf) { struct dql *dql = &queue->dql; return sprintf(buf, "%u\n", jiffies_to_msecs(dql->slack_hold_time)); } static ssize_t bql_set_hold_time(struct netdev_queue *queue, const char *buf, size_t len) { struct dql *dql = &queue->dql; unsigned int value; int err; err = kstrtouint(buf, 10, &value); if (err < 0) return err; dql->slack_hold_time = msecs_to_jiffies(value); return len; } static struct netdev_queue_attribute bql_hold_time_attribute __ro_after_init = __ATTR(hold_time, 0644, bql_show_hold_time, bql_set_hold_time); static ssize_t bql_show_inflight(struct netdev_queue *queue, char *buf) { struct dql *dql = &queue->dql; return sprintf(buf, "%u\n", dql->num_queued - dql->num_completed); } static struct netdev_queue_attribute bql_inflight_attribute __ro_after_init = __ATTR(inflight, 0444, bql_show_inflight, NULL); #define BQL_ATTR(NAME, FIELD) \ static ssize_t bql_show_ ## NAME(struct netdev_queue *queue, \ char *buf) \ { \ return bql_show(buf, queue->dql.FIELD); \ } \ \ static ssize_t bql_set_ ## NAME(struct netdev_queue *queue, \ const char *buf, size_t len) \ { \ return bql_set(buf, len, &queue->dql.FIELD); \ } \ \ static struct netdev_queue_attribute bql_ ## NAME ## _attribute __ro_after_init \ = __ATTR(NAME, 0644, \ bql_show_ ## NAME, bql_set_ ## NAME) BQL_ATTR(limit, limit); BQL_ATTR(limit_max, max_limit); BQL_ATTR(limit_min, min_limit); static struct attribute *dql_attrs[] __ro_after_init = { &bql_limit_attribute.attr, &bql_limit_max_attribute.attr, &bql_limit_min_attribute.attr, &bql_hold_time_attribute.attr, &bql_inflight_attribute.attr, NULL }; static const struct attribute_group dql_group = { .name = "byte_queue_limits", .attrs = dql_attrs, }; #endif /* CONFIG_BQL */ #ifdef CONFIG_XPS static ssize_t xps_queue_show(struct 
net_device *dev, unsigned int index, int tc, char *buf, enum xps_map_type type) { struct xps_dev_maps *dev_maps; unsigned long *mask; unsigned int nr_ids; int j, len; rcu_read_lock(); dev_maps = rcu_dereference(dev->xps_maps[type]); /* Default to nr_cpu_ids/dev->num_rx_queues and do not just return 0 * when dev_maps hasn't been allocated yet, to be backward compatible. */ nr_ids = dev_maps ? dev_maps->nr_ids : (type == XPS_CPUS ? nr_cpu_ids : dev->num_rx_queues); mask = bitmap_zalloc(nr_ids, GFP_NOWAIT); if (!mask) { rcu_read_unlock(); return -ENOMEM; } if (!dev_maps || tc >= dev_maps->num_tc) goto out_no_maps; for (j = 0; j < nr_ids; j++) { int i, tci = j * dev_maps->num_tc + tc; struct xps_map *map; map = rcu_dereference(dev_maps->attr_map[tci]); if (!map) continue; for (i = map->len; i--;) { if (map->queues[i] == index) { set_bit(j, mask); break; } } } out_no_maps: rcu_read_unlock(); len = bitmap_print_to_pagebuf(false, buf, mask, nr_ids); bitmap_free(mask); return len < PAGE_SIZE ? len : -EINVAL; } static ssize_t xps_cpus_show(struct netdev_queue *queue, char *buf) { struct net_device *dev = queue->dev; unsigned int index; int len, tc; if (!netif_is_multiqueue(dev)) return -ENOENT; index = get_netdev_queue_index(queue); if (!rtnl_trylock()) return restart_syscall(); /* If queue belongs to subordinate dev use its map */ dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev; tc = netdev_txq_to_tc(dev, index); if (tc < 0) { rtnl_unlock(); return -EINVAL; } /* Make sure the subordinate device can't be freed */ get_device(&dev->dev); rtnl_unlock(); len = xps_queue_show(dev, index, tc, buf, XPS_CPUS); put_device(&dev->dev); return len; } static ssize_t xps_cpus_store(struct netdev_queue *queue, const char *buf, size_t len) { struct net_device *dev = queue->dev; unsigned int index; cpumask_var_t mask; int err; if (!netif_is_multiqueue(dev)) return -ENOENT; if (!capable(CAP_NET_ADMIN)) return -EPERM; if (!alloc_cpumask_var(&mask, GFP_KERNEL)) return -ENOMEM; index = get_netdev_queue_index(queue); err = bitmap_parse(buf, len, cpumask_bits(mask), nr_cpumask_bits); if (err) { free_cpumask_var(mask); return err; } if (!rtnl_trylock()) { free_cpumask_var(mask); return restart_syscall(); } err = netif_set_xps_queue(dev, mask, index); rtnl_unlock(); free_cpumask_var(mask); return err ? : len; } static struct netdev_queue_attribute xps_cpus_attribute __ro_after_init = __ATTR_RW(xps_cpus); static ssize_t xps_rxqs_show(struct netdev_queue *queue, char *buf) { struct net_device *dev = queue->dev; unsigned int index; int tc; index = get_netdev_queue_index(queue); if (!rtnl_trylock()) return restart_syscall(); tc = netdev_txq_to_tc(dev, index); rtnl_unlock(); if (tc < 0) return -EINVAL; return xps_queue_show(dev, index, tc, buf, XPS_RXQS); } static ssize_t xps_rxqs_store(struct netdev_queue *queue, const char *buf, size_t len) { struct net_device *dev = queue->dev; struct net *net = dev_net(dev); unsigned long *mask; unsigned int index; int err; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; mask = bitmap_zalloc(dev->num_rx_queues, GFP_KERNEL); if (!mask) return -ENOMEM; index = get_netdev_queue_index(queue); err = bitmap_parse(buf, len, mask, dev->num_rx_queues); if (err) { bitmap_free(mask); return err; } if (!rtnl_trylock()) { bitmap_free(mask); return restart_syscall(); } cpus_read_lock(); err = __netif_set_xps_queue(dev, mask, index, XPS_RXQS); cpus_read_unlock(); rtnl_unlock(); bitmap_free(mask); return err ? 
: len; } static struct netdev_queue_attribute xps_rxqs_attribute __ro_after_init = __ATTR_RW(xps_rxqs); #endif /* CONFIG_XPS */ static struct attribute *netdev_queue_default_attrs[] __ro_after_init = { &queue_trans_timeout.attr, &queue_traffic_class.attr, #ifdef CONFIG_XPS &xps_cpus_attribute.attr, &xps_rxqs_attribute.attr, &queue_tx_maxrate.attr, #endif NULL }; ATTRIBUTE_GROUPS(netdev_queue_default); static void netdev_queue_release(struct kobject *kobj) { struct netdev_queue *queue = to_netdev_queue(kobj); memset(kobj, 0, sizeof(*kobj)); dev_put(queue->dev); } static const void *netdev_queue_namespace(struct kobject *kobj) { struct netdev_queue *queue = to_netdev_queue(kobj); struct device *dev = &queue->dev->dev; const void *ns = NULL; if (dev->class && dev->class->ns_type) ns = dev->class->namespace(dev); return ns; } static void netdev_queue_get_ownership(struct kobject *kobj, kuid_t *uid, kgid_t *gid) { const struct net *net = netdev_queue_namespace(kobj); net_ns_get_ownership(net, uid, gid); } static struct kobj_type netdev_queue_ktype __ro_after_init = { .sysfs_ops = &netdev_queue_sysfs_ops, .release = netdev_queue_release, .default_groups = netdev_queue_default_groups, .namespace = netdev_queue_namespace, .get_ownership = netdev_queue_get_ownership, }; static int netdev_queue_add_kobject(struct net_device *dev, int index) { struct netdev_queue *queue = dev->_tx + index; struct kobject *kobj = &queue->kobj; int error = 0; /* Kobject_put later will trigger netdev_queue_release call * which decreases dev refcount: Take that reference here */ dev_hold(queue->dev); kobj->kset = dev->queues_kset; error = kobject_init_and_add(kobj, &netdev_queue_ktype, NULL, "tx-%u", index); if (error) goto err; #ifdef CONFIG_BQL error = sysfs_create_group(kobj, &dql_group); if (error) goto err; #endif kobject_uevent(kobj, KOBJ_ADD); return 0; err: kobject_put(kobj); return error; } static int tx_queue_change_owner(struct net_device *ndev, int index, kuid_t kuid, kgid_t kgid) { struct netdev_queue *queue = ndev->_tx + index; struct kobject *kobj = &queue->kobj; int error; error = sysfs_change_owner(kobj, kuid, kgid); if (error) return error; #ifdef CONFIG_BQL error = sysfs_group_change_owner(kobj, &dql_group, kuid, kgid); #endif return error; } #endif /* CONFIG_SYSFS */ int netdev_queue_update_kobjects(struct net_device *dev, int old_num, int new_num) { #ifdef CONFIG_SYSFS int i; int error = 0; for (i = old_num; i < new_num; i++) { error = netdev_queue_add_kobject(dev, i); if (error) { new_num = old_num; break; } } while (--i >= new_num) { struct netdev_queue *queue = dev->_tx + i; if (!refcount_read(&dev_net(dev)->ns.count)) queue->kobj.uevent_suppress = 1; #ifdef CONFIG_BQL sysfs_remove_group(&queue->kobj, &dql_group); #endif kobject_put(&queue->kobj); } return error; #else return 0; #endif /* CONFIG_SYSFS */ } static int net_tx_queue_change_owner(struct net_device *dev, int num, kuid_t kuid, kgid_t kgid) { #ifdef CONFIG_SYSFS int error = 0; int i; for (i = 0; i < num; i++) { error = tx_queue_change_owner(dev, i, kuid, kgid); if (error) break; } return error; #else return 0; #endif /* CONFIG_SYSFS */ } static int register_queue_kobjects(struct net_device *dev) { int error = 0, txq = 0, rxq = 0, real_rx = 0, real_tx = 0; #ifdef CONFIG_SYSFS dev->queues_kset = kset_create_and_add("queues", NULL, &dev->dev.kobj); if (!dev->queues_kset) return -ENOMEM; real_rx = dev->real_num_rx_queues; #endif real_tx = dev->real_num_tx_queues; error = net_rx_queue_update_kobjects(dev, 0, real_rx); if (error) goto error; 
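	/* All RX queue kobjects are registered; remember the count so the
	 * error path below can unwind them if TX queue registration fails.
	 */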
rxq = real_rx; error = netdev_queue_update_kobjects(dev, 0, real_tx); if (error) goto error; txq = real_tx; return 0; error: netdev_queue_update_kobjects(dev, txq, 0); net_rx_queue_update_kobjects(dev, rxq, 0); #ifdef CONFIG_SYSFS kset_unregister(dev->queues_kset); #endif return error; } static int queue_change_owner(struct net_device *ndev, kuid_t kuid, kgid_t kgid) { int error = 0, real_rx = 0, real_tx = 0; #ifdef CONFIG_SYSFS if (ndev->queues_kset) { error = sysfs_change_owner(&ndev->queues_kset->kobj, kuid, kgid); if (error) return error; } real_rx = ndev->real_num_rx_queues; #endif real_tx = ndev->real_num_tx_queues; error = net_rx_queue_change_owner(ndev, real_rx, kuid, kgid); if (error) return error; error = net_tx_queue_change_owner(ndev, real_tx, kuid, kgid); if (error) return error; return 0; } static void remove_queue_kobjects(struct net_device *dev) { int real_rx = 0, real_tx = 0; #ifdef CONFIG_SYSFS real_rx = dev->real_num_rx_queues; #endif real_tx = dev->real_num_tx_queues; net_rx_queue_update_kobjects(dev, real_rx, 0); netdev_queue_update_kobjects(dev, real_tx, 0); dev->real_num_rx_queues = 0; dev->real_num_tx_queues = 0; #ifdef CONFIG_SYSFS kset_unregister(dev->queues_kset); #endif } static bool net_current_may_mount(void) { struct net *net = current->nsproxy->net_ns; return ns_capable(net->user_ns, CAP_SYS_ADMIN); } static void *net_grab_current_ns(void) { struct net *ns = current->nsproxy->net_ns; #ifdef CONFIG_NET_NS if (ns) refcount_inc(&ns->passive); #endif return ns; } static const void *net_initial_ns(void) { return &init_net; } static const void *net_netlink_ns(struct sock *sk) { return sock_net(sk); } const struct kobj_ns_type_operations net_ns_type_operations = { .type = KOBJ_NS_TYPE_NET, .current_may_mount = net_current_may_mount, .grab_current_ns = net_grab_current_ns, .netlink_ns = net_netlink_ns, .initial_ns = net_initial_ns, .drop_ns = net_drop_ns, }; EXPORT_SYMBOL_GPL(net_ns_type_operations); static int netdev_uevent(struct device *d, struct kobj_uevent_env *env) { struct net_device *dev = to_net_dev(d); int retval; /* pass interface to uevent. */ retval = add_uevent_var(env, "INTERFACE=%s", dev->name); if (retval) goto exit; /* pass ifindex to uevent. * ifindex is useful as it won't change (interface name may change) * and is what RtNetlink uses natively. */ retval = add_uevent_var(env, "IFINDEX=%d", dev->ifindex); exit: return retval; } /* * netdev_release -- destroy and free a dead device. * Called when last reference to device kobject is gone. */ static void netdev_release(struct device *d) { struct net_device *dev = to_net_dev(d); BUG_ON(dev->reg_state != NETREG_RELEASED); /* no need to wait for rcu grace period: * device is dead and about to be freed. 
*/ kfree(rcu_access_pointer(dev->ifalias)); netdev_freemem(dev); } static const void *net_namespace(struct device *d) { struct net_device *dev = to_net_dev(d); return dev_net(dev); } static void net_get_ownership(struct device *d, kuid_t *uid, kgid_t *gid) { struct net_device *dev = to_net_dev(d); const struct net *net = dev_net(dev); net_ns_get_ownership(net, uid, gid); } static struct class net_class __ro_after_init = { .name = "net", .dev_release = netdev_release, .dev_groups = net_class_groups, .dev_uevent = netdev_uevent, .ns_type = &net_ns_type_operations, .namespace = net_namespace, .get_ownership = net_get_ownership, }; #ifdef CONFIG_OF static int of_dev_node_match(struct device *dev, const void *data) { for (; dev; dev = dev->parent) { if (dev->of_node == data) return 1; } return 0; } /* * of_find_net_device_by_node - lookup the net device for the device node * @np: OF device node * * Looks up the net_device structure corresponding with the device node. * If successful, returns a pointer to the net_device with the embedded * struct device refcount incremented by one, or NULL on failure. The * refcount must be dropped when done with the net_device. */ struct net_device *of_find_net_device_by_node(struct device_node *np) { struct device *dev; dev = class_find_device(&net_class, NULL, np, of_dev_node_match); if (!dev) return NULL; return to_net_dev(dev); } EXPORT_SYMBOL(of_find_net_device_by_node); #endif /* Delete sysfs entries but hold kobject reference until after all * netdev references are gone. */ void netdev_unregister_kobject(struct net_device *ndev) { struct device *dev = &ndev->dev; if (!refcount_read(&dev_net(ndev)->ns.count)) dev_set_uevent_suppress(dev, 1); kobject_get(&dev->kobj); remove_queue_kobjects(ndev); pm_runtime_set_memalloc_noio(dev, false); device_del(dev); } /* Create sysfs entries for network device. */ int netdev_register_kobject(struct net_device *ndev) { struct device *dev = &ndev->dev; const struct attribute_group **groups = ndev->sysfs_groups; int error = 0; device_initialize(dev); dev->class = &net_class; dev->platform_data = ndev; dev->groups = groups; dev_set_name(dev, "%s", ndev->name); #ifdef CONFIG_SYSFS /* Allow for a device specific group */ if (*groups) groups++; *groups++ = &netstat_group; #if IS_ENABLED(CONFIG_WIRELESS_EXT) || IS_ENABLED(CONFIG_CFG80211) if (ndev->ieee80211_ptr) *groups++ = &wireless_group; #if IS_ENABLED(CONFIG_WIRELESS_EXT) else if (ndev->wireless_handlers) *groups++ = &wireless_group; #endif #endif #endif /* CONFIG_SYSFS */ error = device_add(dev); if (error) return error; error = register_queue_kobjects(ndev); if (error) { device_del(dev); return error; } pm_runtime_set_memalloc_noio(dev, true); return error; } /* Change owner for sysfs entries when moving network devices across network * namespaces owned by different user namespaces. */ int netdev_change_owner(struct net_device *ndev, const struct net *net_old, const struct net *net_new) { kuid_t old_uid = GLOBAL_ROOT_UID, new_uid = GLOBAL_ROOT_UID; kgid_t old_gid = GLOBAL_ROOT_GID, new_gid = GLOBAL_ROOT_GID; struct device *dev = &ndev->dev; int error; net_ns_get_ownership(net_old, &old_uid, &old_gid); net_ns_get_ownership(net_new, &new_uid, &new_gid); /* The network namespace was changed but the owning user namespace is * identical so there's no need to change the owner of sysfs entries. 
*/ if (uid_eq(old_uid, new_uid) && gid_eq(old_gid, new_gid)) return 0; error = device_change_owner(dev, new_uid, new_gid); if (error) return error; error = queue_change_owner(ndev, new_uid, new_gid); if (error) return error; return 0; } int netdev_class_create_file_ns(const struct class_attribute *class_attr, const void *ns) { return class_create_file_ns(&net_class, class_attr, ns); } EXPORT_SYMBOL(netdev_class_create_file_ns); void netdev_class_remove_file_ns(const struct class_attribute *class_attr, const void *ns) { class_remove_file_ns(&net_class, class_attr, ns); } EXPORT_SYMBOL(netdev_class_remove_file_ns); int __init netdev_kobject_init(void) { kobj_ns_type_register(&net_ns_type_operations); return class_register(&net_class); }
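netdev_class_create_file_ns() and netdev_class_remove_file_ns() above are the exported hooks other kernel code uses to publish class-wide files directly under /sys/class/net/ (the bonding driver's bonding_masters file is built this way). A minimal sketch of that usage follows; the attribute name "example", its placeholder value and the module wrapping are invented for illustration, and NULL is passed as the namespace tag only to keep the sketch short.

#include <linux/device.h>
#include <linux/module.h>
#include <linux/netdevice.h>

/* hypothetical read-only file: /sys/class/net/example */
static ssize_t example_show(struct class *class, struct class_attribute *attr,
			    char *buf)
{
	return sprintf(buf, "%d\n", 42);	/* placeholder value */
}
static CLASS_ATTR_RO(example);

static int __init example_init(void)
{
	return netdev_class_create_file_ns(&class_attr_example, NULL);
}

static void __exit example_exit(void)
{
	netdev_class_remove_file_ns(&class_attr_example, NULL);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("sketch: class-wide attribute under /sys/class/net");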
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * drivers/net/macsec.c - MACsec device
 *
 * Copyright (c) 2015 Sabrina Dubroca <sd@queasysnail.net>
 */

#include <linux/types.h>
#include <linux/skbuff.h>
#include <linux/socket.h>
#include
<linux/module.h> #include <crypto/aead.h> #include <linux/etherdevice.h> #include <linux/netdevice.h> #include <linux/rtnetlink.h> #include <linux/refcount.h> #include <net/genetlink.h> #include <net/sock.h> #include <net/gro_cells.h> #include <net/macsec.h> #include <linux/phy.h> #include <linux/byteorder/generic.h> #include <linux/if_arp.h> #include <uapi/linux/if_macsec.h> #define MACSEC_SCI_LEN 8 /* SecTAG length = macsec_eth_header without the optional SCI */ #define MACSEC_TAG_LEN 6 struct macsec_eth_header { struct ethhdr eth; /* SecTAG */ u8 tci_an; #if defined(__LITTLE_ENDIAN_BITFIELD) u8 short_length:6, unused:2; #elif defined(__BIG_ENDIAN_BITFIELD) u8 unused:2, short_length:6; #else #error "Please fix <asm/byteorder.h>" #endif __be32 packet_number; u8 secure_channel_id[8]; /* optional */ } __packed; #define MACSEC_TCI_VERSION 0x80 #define MACSEC_TCI_ES 0x40 /* end station */ #define MACSEC_TCI_SC 0x20 /* SCI present */ #define MACSEC_TCI_SCB 0x10 /* epon */ #define MACSEC_TCI_E 0x08 /* encryption */ #define MACSEC_TCI_C 0x04 /* changed text */ #define MACSEC_AN_MASK 0x03 /* association number */ #define MACSEC_TCI_CONFID (MACSEC_TCI_E | MACSEC_TCI_C) /* minimum secure data length deemed "not short", see IEEE 802.1AE-2006 9.7 */ #define MIN_NON_SHORT_LEN 48 #define GCM_AES_IV_LEN 12 #define DEFAULT_ICV_LEN 16 #define for_each_rxsc(secy, sc) \ for (sc = rcu_dereference_bh(secy->rx_sc); \ sc; \ sc = rcu_dereference_bh(sc->next)) #define for_each_rxsc_rtnl(secy, sc) \ for (sc = rtnl_dereference(secy->rx_sc); \ sc; \ sc = rtnl_dereference(sc->next)) #define pn_same_half(pn1, pn2) (!(((pn1) >> 31) ^ ((pn2) >> 31))) struct gcm_iv_xpn { union { u8 short_secure_channel_id[4]; ssci_t ssci; }; __be64 pn; } __packed; struct gcm_iv { union { u8 secure_channel_id[8]; sci_t sci; }; __be32 pn; }; #define MACSEC_VALIDATE_DEFAULT MACSEC_VALIDATE_STRICT struct pcpu_secy_stats { struct macsec_dev_stats stats; struct u64_stats_sync syncp; }; /** * struct macsec_dev - private data * @secy: SecY config * @real_dev: pointer to underlying netdevice * @stats: MACsec device stats * @secys: linked list of SecY's on the underlying device * @gro_cells: pointer to the Generic Receive Offload cell * @offload: status of offloading on the MACsec device */ struct macsec_dev { struct macsec_secy secy; struct net_device *real_dev; struct pcpu_secy_stats __percpu *stats; struct list_head secys; struct gro_cells gro_cells; enum macsec_offload offload; }; /** * struct macsec_rxh_data - rx_handler private argument * @secys: linked list of SecY's on this underlying device */ struct macsec_rxh_data { struct list_head secys; }; static struct macsec_dev *macsec_priv(const struct net_device *dev) { return (struct macsec_dev *)netdev_priv(dev); } static struct macsec_rxh_data *macsec_data_rcu(const struct net_device *dev) { return rcu_dereference_bh(dev->rx_handler_data); } static struct macsec_rxh_data *macsec_data_rtnl(const struct net_device *dev) { return rtnl_dereference(dev->rx_handler_data); } struct macsec_cb { struct aead_request *req; union { struct macsec_tx_sa *tx_sa; struct macsec_rx_sa *rx_sa; }; u8 assoc_num; bool valid; bool has_sci; }; static struct macsec_rx_sa *macsec_rxsa_get(struct macsec_rx_sa __rcu *ptr) { struct macsec_rx_sa *sa = rcu_dereference_bh(ptr); if (!sa || !sa->active) return NULL; if (!refcount_inc_not_zero(&sa->refcnt)) return NULL; return sa; } static void free_rx_sc_rcu(struct rcu_head *head) { struct macsec_rx_sc *rx_sc = container_of(head, struct macsec_rx_sc, rcu_head); 
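	/* The RCU grace period has elapsed, so no reader can still hold this
	 * rx_sc: release its per-CPU stats and the structure itself.
	 */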
free_percpu(rx_sc->stats); kfree(rx_sc); } static struct macsec_rx_sc *macsec_rxsc_get(struct macsec_rx_sc *sc) { return refcount_inc_not_zero(&sc->refcnt) ? sc : NULL; } static void macsec_rxsc_put(struct macsec_rx_sc *sc) { if (refcount_dec_and_test(&sc->refcnt)) call_rcu(&sc->rcu_head, free_rx_sc_rcu); } static void free_rxsa(struct rcu_head *head) { struct macsec_rx_sa *sa = container_of(head, struct macsec_rx_sa, rcu); crypto_free_aead(sa->key.tfm); free_percpu(sa->stats); kfree(sa); } static void macsec_rxsa_put(struct macsec_rx_sa *sa) { if (refcount_dec_and_test(&sa->refcnt)) call_rcu(&sa->rcu, free_rxsa); } static struct macsec_tx_sa *macsec_txsa_get(struct macsec_tx_sa __rcu *ptr) { struct macsec_tx_sa *sa = rcu_dereference_bh(ptr); if (!sa || !sa->active) return NULL; if (!refcount_inc_not_zero(&sa->refcnt)) return NULL; return sa; } static void free_txsa(struct rcu_head *head) { struct macsec_tx_sa *sa = container_of(head, struct macsec_tx_sa, rcu); crypto_free_aead(sa->key.tfm); free_percpu(sa->stats); kfree(sa); } static void macsec_txsa_put(struct macsec_tx_sa *sa) { if (refcount_dec_and_test(&sa->refcnt)) call_rcu(&sa->rcu, free_txsa); } static struct macsec_cb *macsec_skb_cb(struct sk_buff *skb) { BUILD_BUG_ON(sizeof(struct macsec_cb) > sizeof(skb->cb)); return (struct macsec_cb *)skb->cb; } #define MACSEC_PORT_ES (htons(0x0001)) #define MACSEC_PORT_SCB (0x0000) #define MACSEC_UNDEF_SCI ((__force sci_t)0xffffffffffffffffULL) #define MACSEC_UNDEF_SSCI ((__force ssci_t)0xffffffff) #define MACSEC_GCM_AES_128_SAK_LEN 16 #define MACSEC_GCM_AES_256_SAK_LEN 32 #define DEFAULT_SAK_LEN MACSEC_GCM_AES_128_SAK_LEN #define DEFAULT_XPN false #define DEFAULT_SEND_SCI true #define DEFAULT_ENCRYPT false #define DEFAULT_ENCODING_SA 0 #define MACSEC_XPN_MAX_REPLAY_WINDOW (((1 << 30) - 1)) static bool send_sci(const struct macsec_secy *secy) { const struct macsec_tx_sc *tx_sc = &secy->tx_sc; return tx_sc->send_sci || (secy->n_rx_sc > 1 && !tx_sc->end_station && !tx_sc->scb); } static sci_t make_sci(u8 *addr, __be16 port) { sci_t sci; memcpy(&sci, addr, ETH_ALEN); memcpy(((char *)&sci) + ETH_ALEN, &port, sizeof(port)); return sci; } static sci_t macsec_frame_sci(struct macsec_eth_header *hdr, bool sci_present) { sci_t sci; if (sci_present) memcpy(&sci, hdr->secure_channel_id, sizeof(hdr->secure_channel_id)); else sci = make_sci(hdr->eth.h_source, MACSEC_PORT_ES); return sci; } static unsigned int macsec_sectag_len(bool sci_present) { return MACSEC_TAG_LEN + (sci_present ? 
MACSEC_SCI_LEN : 0); } static unsigned int macsec_hdr_len(bool sci_present) { return macsec_sectag_len(sci_present) + ETH_HLEN; } static unsigned int macsec_extra_len(bool sci_present) { return macsec_sectag_len(sci_present) + sizeof(__be16); } /* Fill SecTAG according to IEEE 802.1AE-2006 10.5.3 */ static void macsec_fill_sectag(struct macsec_eth_header *h, const struct macsec_secy *secy, u32 pn, bool sci_present) { const struct macsec_tx_sc *tx_sc = &secy->tx_sc; memset(&h->tci_an, 0, macsec_sectag_len(sci_present)); h->eth.h_proto = htons(ETH_P_MACSEC); if (sci_present) { h->tci_an |= MACSEC_TCI_SC; memcpy(&h->secure_channel_id, &secy->sci, sizeof(h->secure_channel_id)); } else { if (tx_sc->end_station) h->tci_an |= MACSEC_TCI_ES; if (tx_sc->scb) h->tci_an |= MACSEC_TCI_SCB; } h->packet_number = htonl(pn); /* with GCM, C/E clear for !encrypt, both set for encrypt */ if (tx_sc->encrypt) h->tci_an |= MACSEC_TCI_CONFID; else if (secy->icv_len != DEFAULT_ICV_LEN) h->tci_an |= MACSEC_TCI_C; h->tci_an |= tx_sc->encoding_sa; } static void macsec_set_shortlen(struct macsec_eth_header *h, size_t data_len) { if (data_len < MIN_NON_SHORT_LEN) h->short_length = data_len; } /* Checks if a MACsec interface is being offloaded to an hardware engine */ static bool macsec_is_offloaded(struct macsec_dev *macsec) { if (macsec->offload == MACSEC_OFFLOAD_MAC || macsec->offload == MACSEC_OFFLOAD_PHY) return true; return false; } /* Checks if underlying layers implement MACsec offloading functions. */ static bool macsec_check_offload(enum macsec_offload offload, struct macsec_dev *macsec) { if (!macsec || !macsec->real_dev) return false; if (offload == MACSEC_OFFLOAD_PHY) return macsec->real_dev->phydev && macsec->real_dev->phydev->macsec_ops; else if (offload == MACSEC_OFFLOAD_MAC) return macsec->real_dev->features & NETIF_F_HW_MACSEC && macsec->real_dev->macsec_ops; return false; } static const struct macsec_ops *__macsec_get_ops(enum macsec_offload offload, struct macsec_dev *macsec, struct macsec_context *ctx) { if (ctx) { memset(ctx, 0, sizeof(*ctx)); ctx->offload = offload; if (offload == MACSEC_OFFLOAD_PHY) ctx->phydev = macsec->real_dev->phydev; else if (offload == MACSEC_OFFLOAD_MAC) ctx->netdev = macsec->real_dev; } if (offload == MACSEC_OFFLOAD_PHY) return macsec->real_dev->phydev->macsec_ops; else return macsec->real_dev->macsec_ops; } /* Returns a pointer to the MACsec ops struct if any and updates the MACsec * context device reference if provided. 
*/ static const struct macsec_ops *macsec_get_ops(struct macsec_dev *macsec, struct macsec_context *ctx) { if (!macsec_check_offload(macsec->offload, macsec)) return NULL; return __macsec_get_ops(macsec->offload, macsec, ctx); } /* validate MACsec packet according to IEEE 802.1AE-2018 9.12 */ static bool macsec_validate_skb(struct sk_buff *skb, u16 icv_len, bool xpn) { struct macsec_eth_header *h = (struct macsec_eth_header *)skb->data; int len = skb->len - 2 * ETH_ALEN; int extra_len = macsec_extra_len(!!(h->tci_an & MACSEC_TCI_SC)) + icv_len; /* a) It comprises at least 17 octets */ if (skb->len <= 16) return false; /* b) MACsec EtherType: already checked */ /* c) V bit is clear */ if (h->tci_an & MACSEC_TCI_VERSION) return false; /* d) ES or SCB => !SC */ if ((h->tci_an & MACSEC_TCI_ES || h->tci_an & MACSEC_TCI_SCB) && (h->tci_an & MACSEC_TCI_SC)) return false; /* e) Bits 7 and 8 of octet 4 of the SecTAG are clear */ if (h->unused) return false; /* rx.pn != 0 if not XPN (figure 10-5 with 802.11AEbw-2013 amendment) */ if (!h->packet_number && !xpn) return false; /* length check, f) g) h) i) */ if (h->short_length) return len == extra_len + h->short_length; return len >= extra_len + MIN_NON_SHORT_LEN; } #define MACSEC_NEEDED_HEADROOM (macsec_extra_len(true)) #define MACSEC_NEEDED_TAILROOM MACSEC_STD_ICV_LEN static void macsec_fill_iv_xpn(unsigned char *iv, ssci_t ssci, u64 pn, salt_t salt) { struct gcm_iv_xpn *gcm_iv = (struct gcm_iv_xpn *)iv; gcm_iv->ssci = ssci ^ salt.ssci; gcm_iv->pn = cpu_to_be64(pn) ^ salt.pn; } static void macsec_fill_iv(unsigned char *iv, sci_t sci, u32 pn) { struct gcm_iv *gcm_iv = (struct gcm_iv *)iv; gcm_iv->sci = sci; gcm_iv->pn = htonl(pn); } static struct macsec_eth_header *macsec_ethhdr(struct sk_buff *skb) { return (struct macsec_eth_header *)skb_mac_header(skb); } static void __macsec_pn_wrapped(struct macsec_secy *secy, struct macsec_tx_sa *tx_sa) { pr_debug("PN wrapped, transitioning to !oper\n"); tx_sa->active = false; if (secy->protect_frames) secy->operational = false; } void macsec_pn_wrapped(struct macsec_secy *secy, struct macsec_tx_sa *tx_sa) { spin_lock_bh(&tx_sa->lock); __macsec_pn_wrapped(secy, tx_sa); spin_unlock_bh(&tx_sa->lock); } EXPORT_SYMBOL_GPL(macsec_pn_wrapped); static pn_t tx_sa_update_pn(struct macsec_tx_sa *tx_sa, struct macsec_secy *secy) { pn_t pn; spin_lock_bh(&tx_sa->lock); pn = tx_sa->next_pn_halves; if (secy->xpn) tx_sa->next_pn++; else tx_sa->next_pn_halves.lower++; if (tx_sa->next_pn == 0) __macsec_pn_wrapped(secy, tx_sa); spin_unlock_bh(&tx_sa->lock); return pn; } static void macsec_encrypt_finish(struct sk_buff *skb, struct net_device *dev) { struct macsec_dev *macsec = netdev_priv(dev); skb->dev = macsec->real_dev; skb_reset_mac_header(skb); skb->protocol = eth_hdr(skb)->h_proto; } static unsigned int macsec_msdu_len(struct sk_buff *skb) { struct macsec_dev *macsec = macsec_priv(skb->dev); struct macsec_secy *secy = &macsec->secy; bool sci_present = macsec_skb_cb(skb)->has_sci; return skb->len - macsec_hdr_len(sci_present) - secy->icv_len; } static void macsec_count_tx(struct sk_buff *skb, struct macsec_tx_sc *tx_sc, struct macsec_tx_sa *tx_sa) { unsigned int msdu_len = macsec_msdu_len(skb); struct pcpu_tx_sc_stats *txsc_stats = this_cpu_ptr(tx_sc->stats); u64_stats_update_begin(&txsc_stats->syncp); if (tx_sc->encrypt) { txsc_stats->stats.OutOctetsEncrypted += msdu_len; txsc_stats->stats.OutPktsEncrypted++; this_cpu_inc(tx_sa->stats->OutPktsEncrypted); } else { txsc_stats->stats.OutOctetsProtected += msdu_len; 
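		/* integrity-only TX SC: the frame carries an ICV but is not
		 * encrypted, so it is accounted under the Protected counters.
		 */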
txsc_stats->stats.OutPktsProtected++; this_cpu_inc(tx_sa->stats->OutPktsProtected); } u64_stats_update_end(&txsc_stats->syncp); } static void count_tx(struct net_device *dev, int ret, int len) { if (likely(ret == NET_XMIT_SUCCESS || ret == NET_XMIT_CN)) { struct pcpu_sw_netstats *stats = this_cpu_ptr(dev->tstats); u64_stats_update_begin(&stats->syncp); stats->tx_packets++; stats->tx_bytes += len; u64_stats_update_end(&stats->syncp); } } static void macsec_encrypt_done(struct crypto_async_request *base, int err) { struct sk_buff *skb = base->data; struct net_device *dev = skb->dev; struct macsec_dev *macsec = macsec_priv(dev); struct macsec_tx_sa *sa = macsec_skb_cb(skb)->tx_sa; int len, ret; aead_request_free(macsec_skb_cb(skb)->req); rcu_read_lock_bh(); macsec_count_tx(skb, &macsec->secy.tx_sc, macsec_skb_cb(skb)->tx_sa); /* packet is encrypted/protected so tx_bytes must be calculated */ len = macsec_msdu_len(skb) + 2 * ETH_ALEN; macsec_encrypt_finish(skb, dev); ret = dev_queue_xmit(skb); count_tx(dev, ret, len); rcu_read_unlock_bh(); macsec_txsa_put(sa); dev_put(dev); } static struct aead_request *macsec_alloc_req(struct crypto_aead *tfm, unsigned char **iv, struct scatterlist **sg, int num_frags) { size_t size, iv_offset, sg_offset; struct aead_request *req; void *tmp; size = sizeof(struct aead_request) + crypto_aead_reqsize(tfm); iv_offset = size; size += GCM_AES_IV_LEN; size = ALIGN(size, __alignof__(struct scatterlist)); sg_offset = size; size += sizeof(struct scatterlist) * num_frags; tmp = kmalloc(size, GFP_ATOMIC); if (!tmp) return NULL; *iv = (unsigned char *)(tmp + iv_offset); *sg = (struct scatterlist *)(tmp + sg_offset); req = tmp; aead_request_set_tfm(req, tfm); return req; } static struct sk_buff *macsec_encrypt(struct sk_buff *skb, struct net_device *dev) { int ret; struct scatterlist *sg; struct sk_buff *trailer; unsigned char *iv; struct ethhdr *eth; struct macsec_eth_header *hh; size_t unprotected_len; struct aead_request *req; struct macsec_secy *secy; struct macsec_tx_sc *tx_sc; struct macsec_tx_sa *tx_sa; struct macsec_dev *macsec = macsec_priv(dev); bool sci_present; pn_t pn; secy = &macsec->secy; tx_sc = &secy->tx_sc; /* 10.5.1 TX SA assignment */ tx_sa = macsec_txsa_get(tx_sc->sa[tx_sc->encoding_sa]); if (!tx_sa) { secy->operational = false; kfree_skb(skb); return ERR_PTR(-EINVAL); } if (unlikely(skb_headroom(skb) < MACSEC_NEEDED_HEADROOM || skb_tailroom(skb) < MACSEC_NEEDED_TAILROOM)) { struct sk_buff *nskb = skb_copy_expand(skb, MACSEC_NEEDED_HEADROOM, MACSEC_NEEDED_TAILROOM, GFP_ATOMIC); if (likely(nskb)) { consume_skb(skb); skb = nskb; } else { macsec_txsa_put(tx_sa); kfree_skb(skb); return ERR_PTR(-ENOMEM); } } else { skb = skb_unshare(skb, GFP_ATOMIC); if (!skb) { macsec_txsa_put(tx_sa); return ERR_PTR(-ENOMEM); } } unprotected_len = skb->len; eth = eth_hdr(skb); sci_present = send_sci(secy); hh = skb_push(skb, macsec_extra_len(sci_present)); memmove(hh, eth, 2 * ETH_ALEN); pn = tx_sa_update_pn(tx_sa, secy); if (pn.full64 == 0) { macsec_txsa_put(tx_sa); kfree_skb(skb); return ERR_PTR(-ENOLINK); } macsec_fill_sectag(hh, secy, pn.lower, sci_present); macsec_set_shortlen(hh, unprotected_len - 2 * ETH_ALEN); skb_put(skb, secy->icv_len); if (skb->len - ETH_HLEN > macsec_priv(dev)->real_dev->mtu) { struct pcpu_secy_stats *secy_stats = this_cpu_ptr(macsec->stats); u64_stats_update_begin(&secy_stats->syncp); secy_stats->stats.OutPktsTooLong++; u64_stats_update_end(&secy_stats->syncp); macsec_txsa_put(tx_sa); kfree_skb(skb); return ERR_PTR(-EINVAL); } ret = 
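/* --- Illustrative sketch (not part of the driver) ---
 * The single-allocation trick used by macsec_alloc_req() above: one buffer
 * holds the AEAD request, the IV and the scatterlist array, with the
 * scatterlist offset rounded up to its alignment. The struct definitions and
 * sizes below are stand-ins; only the offset arithmetic mirrors the driver.
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define SKETCH_ALIGN(x, a)	(((x) + (a) - 1) & ~((size_t)(a) - 1))
#define SKETCH_IV_LEN		12	/* GCM IV length */

struct sketch_req { uint64_t opaque[4]; };	/* stand-in for aead_request + tfm ctx */
struct sketch_sg  { void *page; unsigned int off, len; };

int main(void)
{
	int num_frags = 3;
	size_t size = sizeof(struct sketch_req);
	size_t iv_offset = size;

	size += SKETCH_IV_LEN;
	size = SKETCH_ALIGN(size, _Alignof(struct sketch_sg));
	size_t sg_offset = size;
	size += sizeof(struct sketch_sg) * num_frags;

	unsigned char *buf = malloc(size);
	if (!buf)
		return 1;

	struct sketch_req *req = (struct sketch_req *)buf;
	unsigned char *iv = buf + iv_offset;
	struct sketch_sg *sg = (struct sketch_sg *)(buf + sg_offset);

	printf("one allocation of %zu bytes: req=%p iv=%p sg=%p\n",
	       size, (void *)req, (void *)iv, (void *)sg);
	free(buf);
	return 0;
}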
skb_cow_data(skb, 0, &trailer); if (unlikely(ret < 0)) { macsec_txsa_put(tx_sa); kfree_skb(skb); return ERR_PTR(ret); } req = macsec_alloc_req(tx_sa->key.tfm, &iv, &sg, ret); if (!req) { macsec_txsa_put(tx_sa); kfree_skb(skb); return ERR_PTR(-ENOMEM); } if (secy->xpn) macsec_fill_iv_xpn(iv, tx_sa->ssci, pn.full64, tx_sa->key.salt); else macsec_fill_iv(iv, secy->sci, pn.lower); sg_init_table(sg, ret); ret = skb_to_sgvec(skb, sg, 0, skb->len); if (unlikely(ret < 0)) { aead_request_free(req); macsec_txsa_put(tx_sa); kfree_skb(skb); return ERR_PTR(ret); } if (tx_sc->encrypt) { int len = skb->len - macsec_hdr_len(sci_present) - secy->icv_len; aead_request_set_crypt(req, sg, sg, len, iv); aead_request_set_ad(req, macsec_hdr_len(sci_present)); } else { aead_request_set_crypt(req, sg, sg, 0, iv); aead_request_set_ad(req, skb->len - secy->icv_len); } macsec_skb_cb(skb)->req = req; macsec_skb_cb(skb)->tx_sa = tx_sa; macsec_skb_cb(skb)->has_sci = sci_present; aead_request_set_callback(req, 0, macsec_encrypt_done, skb); dev_hold(skb->dev); ret = crypto_aead_encrypt(req); if (ret == -EINPROGRESS) { return ERR_PTR(ret); } else if (ret != 0) { dev_put(skb->dev); kfree_skb(skb); aead_request_free(req); macsec_txsa_put(tx_sa); return ERR_PTR(-EINVAL); } dev_put(skb->dev); aead_request_free(req); macsec_txsa_put(tx_sa); return skb; } static bool macsec_post_decrypt(struct sk_buff *skb, struct macsec_secy *secy, u32 pn) { struct macsec_rx_sa *rx_sa = macsec_skb_cb(skb)->rx_sa; struct pcpu_rx_sc_stats *rxsc_stats = this_cpu_ptr(rx_sa->sc->stats); struct macsec_eth_header *hdr = macsec_ethhdr(skb); u32 lowest_pn = 0; spin_lock(&rx_sa->lock); if (rx_sa->next_pn_halves.lower >= secy->replay_window) lowest_pn = rx_sa->next_pn_halves.lower - secy->replay_window; /* Now perform replay protection check again * (see IEEE 802.1AE-2006 figure 10-5) */ if (secy->replay_protect && pn < lowest_pn && (!secy->xpn || pn_same_half(pn, lowest_pn))) { spin_unlock(&rx_sa->lock); u64_stats_update_begin(&rxsc_stats->syncp); rxsc_stats->stats.InPktsLate++; u64_stats_update_end(&rxsc_stats->syncp); DEV_STATS_INC(secy->netdev, rx_dropped); return false; } if (secy->validate_frames != MACSEC_VALIDATE_DISABLED) { unsigned int msdu_len = macsec_msdu_len(skb); u64_stats_update_begin(&rxsc_stats->syncp); if (hdr->tci_an & MACSEC_TCI_E) rxsc_stats->stats.InOctetsDecrypted += msdu_len; else rxsc_stats->stats.InOctetsValidated += msdu_len; u64_stats_update_end(&rxsc_stats->syncp); } if (!macsec_skb_cb(skb)->valid) { spin_unlock(&rx_sa->lock); /* 10.6.5 */ if (hdr->tci_an & MACSEC_TCI_C || secy->validate_frames == MACSEC_VALIDATE_STRICT) { u64_stats_update_begin(&rxsc_stats->syncp); rxsc_stats->stats.InPktsNotValid++; u64_stats_update_end(&rxsc_stats->syncp); this_cpu_inc(rx_sa->stats->InPktsNotValid); DEV_STATS_INC(secy->netdev, rx_errors); return false; } u64_stats_update_begin(&rxsc_stats->syncp); if (secy->validate_frames == MACSEC_VALIDATE_CHECK) { rxsc_stats->stats.InPktsInvalid++; this_cpu_inc(rx_sa->stats->InPktsInvalid); } else if (pn < lowest_pn) { rxsc_stats->stats.InPktsDelayed++; } else { rxsc_stats->stats.InPktsUnchecked++; } u64_stats_update_end(&rxsc_stats->syncp); } else { u64_stats_update_begin(&rxsc_stats->syncp); if (pn < lowest_pn) { rxsc_stats->stats.InPktsDelayed++; } else { rxsc_stats->stats.InPktsOK++; this_cpu_inc(rx_sa->stats->InPktsOK); } u64_stats_update_end(&rxsc_stats->syncp); // Instead of "pn >=" - to support pn overflow in xpn if (pn + 1 > rx_sa->next_pn_halves.lower) { rx_sa->next_pn_halves.lower = pn + 1; } 
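/* --- Illustrative sketch (not part of the driver) ---
 * Building the 96-bit AES-GCM IV the way macsec_fill_iv() and
 * macsec_fill_iv_xpn() above do: standard suites concatenate the 8-byte SCI
 * with the 32-bit PN, XPN suites XOR the 32-bit SSCI and the 64-bit PN with
 * the 12-byte key salt. Helper names are invented for the sketch, and byte
 * order is handled explicitly instead of with __be32/__be64 types.
 */
#include <stdint.h>
#include <string.h>
#include <stdio.h>

static void sketch_be32(uint8_t *p, uint32_t v)
{
	p[0] = v >> 24; p[1] = v >> 16; p[2] = v >> 8; p[3] = v;
}

static void sketch_be64(uint8_t *p, uint64_t v)
{
	sketch_be32(p, v >> 32);
	sketch_be32(p + 4, (uint32_t)v);
}

static void sketch_fill_iv(uint8_t iv[12], const uint8_t sci[8], uint32_t pn)
{
	memcpy(iv, sci, 8);		/* IV = SCI || PN */
	sketch_be32(iv + 8, pn);
}

static void sketch_fill_iv_xpn(uint8_t iv[12], uint32_t ssci, uint64_t pn,
			       const uint8_t salt[12])
{
	sketch_be32(iv, ssci);		/* IV = (SSCI || PN64) ^ salt */
	sketch_be64(iv + 4, pn);
	for (int i = 0; i < 12; i++)
		iv[i] ^= salt[i];
}

int main(void)
{
	uint8_t sci[8] = { 0x52, 0x54, 0x00, 0x12, 0x34, 0x56, 0x00, 0x01 };
	uint8_t salt[12] = { 0 };
	uint8_t iv[12];

	sketch_fill_iv(iv, sci, 7);
	printf("gcm iv tail (PN): %02x %02x %02x %02x\n", iv[8], iv[9], iv[10], iv[11]);
	sketch_fill_iv_xpn(iv, 1, 0x100000007ULL, salt);
	printf("xpn iv head (SSCI^salt): %02x %02x %02x %02x\n", iv[0], iv[1], iv[2], iv[3]);
	return 0;
}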
else if (secy->xpn && !pn_same_half(pn, rx_sa->next_pn_halves.lower)) { rx_sa->next_pn_halves.upper++; rx_sa->next_pn_halves.lower = pn + 1; } spin_unlock(&rx_sa->lock); } return true; } static void macsec_reset_skb(struct sk_buff *skb, struct net_device *dev) { skb->pkt_type = PACKET_HOST; skb->protocol = eth_type_trans(skb, dev); skb_reset_network_header(skb); if (!skb_transport_header_was_set(skb)) skb_reset_transport_header(skb); skb_reset_mac_len(skb); } static void macsec_finalize_skb(struct sk_buff *skb, u8 icv_len, u8 hdr_len) { skb->ip_summed = CHECKSUM_NONE; memmove(skb->data + hdr_len, skb->data, 2 * ETH_ALEN); skb_pull(skb, hdr_len); pskb_trim_unique(skb, skb->len - icv_len); } static void count_rx(struct net_device *dev, int len) { struct pcpu_sw_netstats *stats = this_cpu_ptr(dev->tstats); u64_stats_update_begin(&stats->syncp); stats->rx_packets++; stats->rx_bytes += len; u64_stats_update_end(&stats->syncp); } static void macsec_decrypt_done(struct crypto_async_request *base, int err) { struct sk_buff *skb = base->data; struct net_device *dev = skb->dev; struct macsec_dev *macsec = macsec_priv(dev); struct macsec_rx_sa *rx_sa = macsec_skb_cb(skb)->rx_sa; struct macsec_rx_sc *rx_sc = rx_sa->sc; int len; u32 pn; aead_request_free(macsec_skb_cb(skb)->req); if (!err) macsec_skb_cb(skb)->valid = true; rcu_read_lock_bh(); pn = ntohl(macsec_ethhdr(skb)->packet_number); if (!macsec_post_decrypt(skb, &macsec->secy, pn)) { rcu_read_unlock_bh(); kfree_skb(skb); goto out; } macsec_finalize_skb(skb, macsec->secy.icv_len, macsec_extra_len(macsec_skb_cb(skb)->has_sci)); len = skb->len; macsec_reset_skb(skb, macsec->secy.netdev); if (gro_cells_receive(&macsec->gro_cells, skb) == NET_RX_SUCCESS) count_rx(dev, len); rcu_read_unlock_bh(); out: macsec_rxsa_put(rx_sa); macsec_rxsc_put(rx_sc); dev_put(dev); } static struct sk_buff *macsec_decrypt(struct sk_buff *skb, struct net_device *dev, struct macsec_rx_sa *rx_sa, sci_t sci, struct macsec_secy *secy) { int ret; struct scatterlist *sg; struct sk_buff *trailer; unsigned char *iv; struct aead_request *req; struct macsec_eth_header *hdr; u32 hdr_pn; u16 icv_len = secy->icv_len; macsec_skb_cb(skb)->valid = false; skb = skb_share_check(skb, GFP_ATOMIC); if (!skb) return ERR_PTR(-ENOMEM); ret = skb_cow_data(skb, 0, &trailer); if (unlikely(ret < 0)) { kfree_skb(skb); return ERR_PTR(ret); } req = macsec_alloc_req(rx_sa->key.tfm, &iv, &sg, ret); if (!req) { kfree_skb(skb); return ERR_PTR(-ENOMEM); } hdr = (struct macsec_eth_header *)skb->data; hdr_pn = ntohl(hdr->packet_number); if (secy->xpn) { pn_t recovered_pn = rx_sa->next_pn_halves; recovered_pn.lower = hdr_pn; if (hdr_pn < rx_sa->next_pn_halves.lower && !pn_same_half(hdr_pn, rx_sa->next_pn_halves.lower)) recovered_pn.upper++; macsec_fill_iv_xpn(iv, rx_sa->ssci, recovered_pn.full64, rx_sa->key.salt); } else { macsec_fill_iv(iv, sci, hdr_pn); } sg_init_table(sg, ret); ret = skb_to_sgvec(skb, sg, 0, skb->len); if (unlikely(ret < 0)) { aead_request_free(req); kfree_skb(skb); return ERR_PTR(ret); } if (hdr->tci_an & MACSEC_TCI_E) { /* confidentiality: ethernet + macsec header * authenticated, encrypted payload */ int len = skb->len - macsec_hdr_len(macsec_skb_cb(skb)->has_sci); aead_request_set_crypt(req, sg, sg, len, iv); aead_request_set_ad(req, macsec_hdr_len(macsec_skb_cb(skb)->has_sci)); skb = skb_unshare(skb, GFP_ATOMIC); if (!skb) { aead_request_free(req); return ERR_PTR(-ENOMEM); } } else { /* integrity only: all headers + data authenticated */ aead_request_set_crypt(req, sg, sg, icv_len, iv); 
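/* --- Illustrative sketch (not part of the driver) ---
 * Recovering the full 64-bit XPN packet number from the 32-bit value carried
 * in the SecTAG, mirroring the recovered_pn logic in macsec_decrypt() above.
 * The receiver keeps the expected next PN; when the on-wire low half is
 * numerically smaller and lies in the other half of the 32-bit space, the
 * upper half must have wrapped.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool sketch_pn_same_half(uint32_t a, uint32_t b)
{
	return (a >> 31) == (b >> 31);
}

static uint64_t sketch_recover_pn(uint64_t expected_next_pn, uint32_t wire_pn)
{
	uint32_t expected_lower = (uint32_t)expected_next_pn;
	uint32_t upper = (uint32_t)(expected_next_pn >> 32);

	if (wire_pn < expected_lower &&
	    !sketch_pn_same_half(wire_pn, expected_lower))
		upper++;	/* the 32-bit counter wrapped since expected_next_pn */

	return ((uint64_t)upper << 32) | wire_pn;
}

int main(void)
{
	/* expecting 0x00000001_fffffff0, frame arrives with low PN 0x00000005 */
	printf("recovered PN = 0x%016llx\n",
	       (unsigned long long)sketch_recover_pn(0x1fffffff0ULL, 0x5));
	return 0;
}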
aead_request_set_ad(req, skb->len - icv_len); } macsec_skb_cb(skb)->req = req; skb->dev = dev; aead_request_set_callback(req, 0, macsec_decrypt_done, skb); dev_hold(dev); ret = crypto_aead_decrypt(req); if (ret == -EINPROGRESS) { return ERR_PTR(ret); } else if (ret != 0) { /* decryption/authentication failed * 10.6 if validateFrames is disabled, deliver anyway */ if (ret != -EBADMSG) { kfree_skb(skb); skb = ERR_PTR(ret); } } else { macsec_skb_cb(skb)->valid = true; } dev_put(dev); aead_request_free(req); return skb; } static struct macsec_rx_sc *find_rx_sc(struct macsec_secy *secy, sci_t sci) { struct macsec_rx_sc *rx_sc; for_each_rxsc(secy, rx_sc) { if (rx_sc->sci == sci) return rx_sc; } return NULL; } static struct macsec_rx_sc *find_rx_sc_rtnl(struct macsec_secy *secy, sci_t sci) { struct macsec_rx_sc *rx_sc; for_each_rxsc_rtnl(secy, rx_sc) { if (rx_sc->sci == sci) return rx_sc; } return NULL; } static enum rx_handler_result handle_not_macsec(struct sk_buff *skb) { /* Deliver to the uncontrolled port by default */ enum rx_handler_result ret = RX_HANDLER_PASS; struct ethhdr *hdr = eth_hdr(skb); struct macsec_rxh_data *rxd; struct macsec_dev *macsec; rcu_read_lock(); rxd = macsec_data_rcu(skb->dev); list_for_each_entry_rcu(macsec, &rxd->secys, secys) { struct sk_buff *nskb; struct pcpu_secy_stats *secy_stats = this_cpu_ptr(macsec->stats); struct net_device *ndev = macsec->secy.netdev; /* If h/w offloading is enabled, HW decodes frames and strips * the SecTAG, so we have to deduce which port to deliver to. */ if (macsec_is_offloaded(macsec) && netif_running(ndev)) { if (ether_addr_equal_64bits(hdr->h_dest, ndev->dev_addr)) { /* exact match, divert skb to this port */ skb->dev = ndev; skb->pkt_type = PACKET_HOST; ret = RX_HANDLER_ANOTHER; goto out; } else if (is_multicast_ether_addr_64bits( hdr->h_dest)) { /* multicast frame, deliver on this port too */ nskb = skb_clone(skb, GFP_ATOMIC); if (!nskb) break; nskb->dev = ndev; if (ether_addr_equal_64bits(hdr->h_dest, ndev->broadcast)) nskb->pkt_type = PACKET_BROADCAST; else nskb->pkt_type = PACKET_MULTICAST; netif_rx(nskb); } continue; } /* 10.6 If the management control validateFrames is not * Strict, frames without a SecTAG are received, counted, and * delivered to the Controlled Port */ if (macsec->secy.validate_frames == MACSEC_VALIDATE_STRICT) { u64_stats_update_begin(&secy_stats->syncp); secy_stats->stats.InPktsNoTag++; u64_stats_update_end(&secy_stats->syncp); DEV_STATS_INC(macsec->secy.netdev, rx_dropped); continue; } /* deliver on this port */ nskb = skb_clone(skb, GFP_ATOMIC); if (!nskb) break; nskb->dev = ndev; if (netif_rx(nskb) == NET_RX_SUCCESS) { u64_stats_update_begin(&secy_stats->syncp); secy_stats->stats.InPktsUntagged++; u64_stats_update_end(&secy_stats->syncp); } } out: rcu_read_unlock(); return ret; } static rx_handler_result_t macsec_handle_frame(struct sk_buff **pskb) { struct sk_buff *skb = *pskb; struct net_device *dev = skb->dev; struct macsec_eth_header *hdr; struct macsec_secy *secy = NULL; struct macsec_rx_sc *rx_sc; struct macsec_rx_sa *rx_sa; struct macsec_rxh_data *rxd; struct macsec_dev *macsec; unsigned int len; sci_t sci; u32 hdr_pn; bool cbit; struct pcpu_rx_sc_stats *rxsc_stats; struct pcpu_secy_stats *secy_stats; bool pulled_sci; int ret; if (skb_headroom(skb) < ETH_HLEN) goto drop_direct; hdr = macsec_ethhdr(skb); if (hdr->eth.h_proto != htons(ETH_P_MACSEC)) return handle_not_macsec(skb); skb = skb_unshare(skb, GFP_ATOMIC); *pskb = skb; if (!skb) return RX_HANDLER_CONSUMED; pulled_sci = pskb_may_pull(skb, 
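/* --- Illustrative sketch (not part of the driver) ---
 * How the received frame is split into "associated data" and "crypt" regions
 * for the AEAD decrypt call in macsec_decrypt() above. With the TCI E bit
 * set, only the Ethernet + SecTAG header is authenticated as AAD and the
 * rest (payload ciphertext + ICV) is handed to the cipher; for
 * integrity-only frames everything up to the ICV is AAD and only the ICV is
 * passed as crypt data.
 */
#include <stdbool.h>
#include <stdio.h>

struct sketch_aead_layout {
	unsigned int assoc_len;	/* authenticated, not decrypted */
	unsigned int crypt_len;	/* handed to the AEAD as ciphertext (+ tag) */
};

static struct sketch_aead_layout
sketch_rx_layout(unsigned int frame_len, unsigned int hdr_len,
		 unsigned int icv_len, bool encrypted)
{
	struct sketch_aead_layout l;

	if (encrypted) {
		l.assoc_len = hdr_len;
		l.crypt_len = frame_len - hdr_len;
	} else {
		l.assoc_len = frame_len - icv_len;
		l.crypt_len = icv_len;
	}
	return l;
}

int main(void)
{
	/* 1514-byte frame, 14 + 14 bytes of header (SecTAG with SCI), 16-byte ICV */
	struct sketch_aead_layout l = sketch_rx_layout(1514, 28, 16, true);

	printf("encrypted: aad=%u crypt=%u\n", l.assoc_len, l.crypt_len);
	l = sketch_rx_layout(1514, 28, 16, false);
	printf("integrity-only: aad=%u crypt=%u\n", l.assoc_len, l.crypt_len);
	return 0;
}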
macsec_extra_len(true)); if (!pulled_sci) { if (!pskb_may_pull(skb, macsec_extra_len(false))) goto drop_direct; } hdr = macsec_ethhdr(skb); /* Frames with a SecTAG that has the TCI E bit set but the C * bit clear are discarded, as this reserved encoding is used * to identify frames with a SecTAG that are not to be * delivered to the Controlled Port. */ if ((hdr->tci_an & (MACSEC_TCI_C | MACSEC_TCI_E)) == MACSEC_TCI_E) return RX_HANDLER_PASS; /* now, pull the extra length */ if (hdr->tci_an & MACSEC_TCI_SC) { if (!pulled_sci) goto drop_direct; } /* ethernet header is part of crypto processing */ skb_push(skb, ETH_HLEN); macsec_skb_cb(skb)->has_sci = !!(hdr->tci_an & MACSEC_TCI_SC); macsec_skb_cb(skb)->assoc_num = hdr->tci_an & MACSEC_AN_MASK; sci = macsec_frame_sci(hdr, macsec_skb_cb(skb)->has_sci); rcu_read_lock(); rxd = macsec_data_rcu(skb->dev); list_for_each_entry_rcu(macsec, &rxd->secys, secys) { struct macsec_rx_sc *sc = find_rx_sc(&macsec->secy, sci); sc = sc ? macsec_rxsc_get(sc) : NULL; if (sc) { secy = &macsec->secy; rx_sc = sc; break; } } if (!secy) goto nosci; dev = secy->netdev; macsec = macsec_priv(dev); secy_stats = this_cpu_ptr(macsec->stats); rxsc_stats = this_cpu_ptr(rx_sc->stats); if (!macsec_validate_skb(skb, secy->icv_len, secy->xpn)) { u64_stats_update_begin(&secy_stats->syncp); secy_stats->stats.InPktsBadTag++; u64_stats_update_end(&secy_stats->syncp); DEV_STATS_INC(secy->netdev, rx_errors); goto drop_nosa; } rx_sa = macsec_rxsa_get(rx_sc->sa[macsec_skb_cb(skb)->assoc_num]); if (!rx_sa) { /* 10.6.1 if the SA is not in use */ /* If validateFrames is Strict or the C bit in the * SecTAG is set, discard */ if (hdr->tci_an & MACSEC_TCI_C || secy->validate_frames == MACSEC_VALIDATE_STRICT) { u64_stats_update_begin(&rxsc_stats->syncp); rxsc_stats->stats.InPktsNotUsingSA++; u64_stats_update_end(&rxsc_stats->syncp); DEV_STATS_INC(secy->netdev, rx_errors); goto drop_nosa; } /* not Strict, the frame (with the SecTAG and ICV * removed) is delivered to the Controlled Port. 
*/ u64_stats_update_begin(&rxsc_stats->syncp); rxsc_stats->stats.InPktsUnusedSA++; u64_stats_update_end(&rxsc_stats->syncp); goto deliver; } /* First, PN check to avoid decrypting obviously wrong packets */ hdr_pn = ntohl(hdr->packet_number); if (secy->replay_protect) { bool late; spin_lock(&rx_sa->lock); late = rx_sa->next_pn_halves.lower >= secy->replay_window && hdr_pn < (rx_sa->next_pn_halves.lower - secy->replay_window); if (secy->xpn) late = late && pn_same_half(rx_sa->next_pn_halves.lower, hdr_pn); spin_unlock(&rx_sa->lock); if (late) { u64_stats_update_begin(&rxsc_stats->syncp); rxsc_stats->stats.InPktsLate++; u64_stats_update_end(&rxsc_stats->syncp); DEV_STATS_INC(macsec->secy.netdev, rx_dropped); goto drop; } } macsec_skb_cb(skb)->rx_sa = rx_sa; /* Disabled && !changed text => skip validation */ if (hdr->tci_an & MACSEC_TCI_C || secy->validate_frames != MACSEC_VALIDATE_DISABLED) skb = macsec_decrypt(skb, dev, rx_sa, sci, secy); if (IS_ERR(skb)) { /* the decrypt callback needs the reference */ if (PTR_ERR(skb) != -EINPROGRESS) { macsec_rxsa_put(rx_sa); macsec_rxsc_put(rx_sc); } rcu_read_unlock(); *pskb = NULL; return RX_HANDLER_CONSUMED; } if (!macsec_post_decrypt(skb, secy, hdr_pn)) goto drop; deliver: macsec_finalize_skb(skb, secy->icv_len, macsec_extra_len(macsec_skb_cb(skb)->has_sci)); len = skb->len; macsec_reset_skb(skb, secy->netdev); if (rx_sa) macsec_rxsa_put(rx_sa); macsec_rxsc_put(rx_sc); skb_orphan(skb); ret = gro_cells_receive(&macsec->gro_cells, skb); if (ret == NET_RX_SUCCESS) count_rx(dev, len); else DEV_STATS_INC(macsec->secy.netdev, rx_dropped); rcu_read_unlock(); *pskb = NULL; return RX_HANDLER_CONSUMED; drop: macsec_rxsa_put(rx_sa); drop_nosa: macsec_rxsc_put(rx_sc); rcu_read_unlock(); drop_direct: kfree_skb(skb); *pskb = NULL; return RX_HANDLER_CONSUMED; nosci: /* 10.6.1 if the SC is not found */ cbit = !!(hdr->tci_an & MACSEC_TCI_C); if (!cbit) macsec_finalize_skb(skb, DEFAULT_ICV_LEN, macsec_extra_len(macsec_skb_cb(skb)->has_sci)); list_for_each_entry_rcu(macsec, &rxd->secys, secys) { struct sk_buff *nskb; secy_stats = this_cpu_ptr(macsec->stats); /* If validateFrames is Strict or the C bit in the * SecTAG is set, discard */ if (cbit || macsec->secy.validate_frames == MACSEC_VALIDATE_STRICT) { u64_stats_update_begin(&secy_stats->syncp); secy_stats->stats.InPktsNoSCI++; u64_stats_update_end(&secy_stats->syncp); DEV_STATS_INC(macsec->secy.netdev, rx_errors); continue; } /* not strict, the frame (with the SecTAG and ICV * removed) is delivered to the Controlled Port. 
*/ nskb = skb_clone(skb, GFP_ATOMIC); if (!nskb) break; macsec_reset_skb(nskb, macsec->secy.netdev); ret = netif_rx(nskb); if (ret == NET_RX_SUCCESS) { u64_stats_update_begin(&secy_stats->syncp); secy_stats->stats.InPktsUnknownSCI++; u64_stats_update_end(&secy_stats->syncp); } else { DEV_STATS_INC(macsec->secy.netdev, rx_dropped); } } rcu_read_unlock(); *pskb = skb; return RX_HANDLER_PASS; } static struct crypto_aead *macsec_alloc_tfm(char *key, int key_len, int icv_len) { struct crypto_aead *tfm; int ret; tfm = crypto_alloc_aead("gcm(aes)", 0, 0); if (IS_ERR(tfm)) return tfm; ret = crypto_aead_setkey(tfm, key, key_len); if (ret < 0) goto fail; ret = crypto_aead_setauthsize(tfm, icv_len); if (ret < 0) goto fail; return tfm; fail: crypto_free_aead(tfm); return ERR_PTR(ret); } static int init_rx_sa(struct macsec_rx_sa *rx_sa, char *sak, int key_len, int icv_len) { rx_sa->stats = alloc_percpu(struct macsec_rx_sa_stats); if (!rx_sa->stats) return -ENOMEM; rx_sa->key.tfm = macsec_alloc_tfm(sak, key_len, icv_len); if (IS_ERR(rx_sa->key.tfm)) { free_percpu(rx_sa->stats); return PTR_ERR(rx_sa->key.tfm); } rx_sa->ssci = MACSEC_UNDEF_SSCI; rx_sa->active = false; rx_sa->next_pn = 1; refcount_set(&rx_sa->refcnt, 1); spin_lock_init(&rx_sa->lock); return 0; } static void clear_rx_sa(struct macsec_rx_sa *rx_sa) { rx_sa->active = false; macsec_rxsa_put(rx_sa); } static void free_rx_sc(struct macsec_rx_sc *rx_sc) { int i; for (i = 0; i < MACSEC_NUM_AN; i++) { struct macsec_rx_sa *sa = rtnl_dereference(rx_sc->sa[i]); RCU_INIT_POINTER(rx_sc->sa[i], NULL); if (sa) clear_rx_sa(sa); } macsec_rxsc_put(rx_sc); } static struct macsec_rx_sc *del_rx_sc(struct macsec_secy *secy, sci_t sci) { struct macsec_rx_sc *rx_sc, __rcu **rx_scp; for (rx_scp = &secy->rx_sc, rx_sc = rtnl_dereference(*rx_scp); rx_sc; rx_scp = &rx_sc->next, rx_sc = rtnl_dereference(*rx_scp)) { if (rx_sc->sci == sci) { if (rx_sc->active) secy->n_rx_sc--; rcu_assign_pointer(*rx_scp, rx_sc->next); return rx_sc; } } return NULL; } static struct macsec_rx_sc *create_rx_sc(struct net_device *dev, sci_t sci, bool active) { struct macsec_rx_sc *rx_sc; struct macsec_dev *macsec; struct net_device *real_dev = macsec_priv(dev)->real_dev; struct macsec_rxh_data *rxd = macsec_data_rtnl(real_dev); struct macsec_secy *secy; list_for_each_entry(macsec, &rxd->secys, secys) { if (find_rx_sc_rtnl(&macsec->secy, sci)) return ERR_PTR(-EEXIST); } rx_sc = kzalloc(sizeof(*rx_sc), GFP_KERNEL); if (!rx_sc) return ERR_PTR(-ENOMEM); rx_sc->stats = netdev_alloc_pcpu_stats(struct pcpu_rx_sc_stats); if (!rx_sc->stats) { kfree(rx_sc); return ERR_PTR(-ENOMEM); } rx_sc->sci = sci; rx_sc->active = active; refcount_set(&rx_sc->refcnt, 1); secy = &macsec_priv(dev)->secy; rcu_assign_pointer(rx_sc->next, secy->rx_sc); rcu_assign_pointer(secy->rx_sc, rx_sc); if (rx_sc->active) secy->n_rx_sc++; return rx_sc; } static int init_tx_sa(struct macsec_tx_sa *tx_sa, char *sak, int key_len, int icv_len) { tx_sa->stats = alloc_percpu(struct macsec_tx_sa_stats); if (!tx_sa->stats) return -ENOMEM; tx_sa->key.tfm = macsec_alloc_tfm(sak, key_len, icv_len); if (IS_ERR(tx_sa->key.tfm)) { free_percpu(tx_sa->stats); return PTR_ERR(tx_sa->key.tfm); } tx_sa->ssci = MACSEC_UNDEF_SSCI; tx_sa->active = false; refcount_set(&tx_sa->refcnt, 1); spin_lock_init(&tx_sa->lock); return 0; } static void clear_tx_sa(struct macsec_tx_sa *tx_sa) { tx_sa->active = false; macsec_txsa_put(tx_sa); } static struct genl_family macsec_fam; static struct net_device *get_dev_from_nl(struct net *net, struct nlattr **attrs) { int 
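/* --- Illustrative sketch (not part of the driver) ---
 * The pointer-to-pointer walk used by del_rx_sc() above to unlink an RX SC
 * from the singly linked list without keeping a "previous node" variable.
 * RCU and refcounting are left out; a plain list of integers stands in for
 * struct macsec_rx_sc.
 */
#include <stdio.h>
#include <stdlib.h>

struct sketch_node {
	int sci;
	struct sketch_node *next;
};

static struct sketch_node *sketch_del(struct sketch_node **head, int sci)
{
	struct sketch_node **linkp, *n;

	for (linkp = head; (n = *linkp) != NULL; linkp = &n->next) {
		if (n->sci == sci) {
			*linkp = n->next;	/* works for head and middle alike */
			return n;
		}
	}
	return NULL;
}

int main(void)
{
	struct sketch_node c = { 3, NULL }, b = { 2, &c }, a = { 1, &b };
	struct sketch_node *head = &a;

	struct sketch_node *gone = sketch_del(&head, 2);
	printf("removed %d, head now %d -> %d\n",
	       gone ? gone->sci : -1, head->sci, head->next->sci);
	return 0;
}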
ifindex = nla_get_u32(attrs[MACSEC_ATTR_IFINDEX]); struct net_device *dev; dev = __dev_get_by_index(net, ifindex); if (!dev) return ERR_PTR(-ENODEV); if (!netif_is_macsec(dev)) return ERR_PTR(-ENODEV); return dev; } static enum macsec_offload nla_get_offload(const struct nlattr *nla) { return (__force enum macsec_offload)nla_get_u8(nla); } static sci_t nla_get_sci(const struct nlattr *nla) { return (__force sci_t)nla_get_u64(nla); } static int nla_put_sci(struct sk_buff *skb, int attrtype, sci_t value, int padattr) { return nla_put_u64_64bit(skb, attrtype, (__force u64)value, padattr); } static ssci_t nla_get_ssci(const struct nlattr *nla) { return (__force ssci_t)nla_get_u32(nla); } static int nla_put_ssci(struct sk_buff *skb, int attrtype, ssci_t value) { return nla_put_u32(skb, attrtype, (__force u64)value); } static struct macsec_tx_sa *get_txsa_from_nl(struct net *net, struct nlattr **attrs, struct nlattr **tb_sa, struct net_device **devp, struct macsec_secy **secyp, struct macsec_tx_sc **scp, u8 *assoc_num) { struct net_device *dev; struct macsec_secy *secy; struct macsec_tx_sc *tx_sc; struct macsec_tx_sa *tx_sa; if (!tb_sa[MACSEC_SA_ATTR_AN]) return ERR_PTR(-EINVAL); *assoc_num = nla_get_u8(tb_sa[MACSEC_SA_ATTR_AN]); dev = get_dev_from_nl(net, attrs); if (IS_ERR(dev)) return ERR_CAST(dev); if (*assoc_num >= MACSEC_NUM_AN) return ERR_PTR(-EINVAL); secy = &macsec_priv(dev)->secy; tx_sc = &secy->tx_sc; tx_sa = rtnl_dereference(tx_sc->sa[*assoc_num]); if (!tx_sa) return ERR_PTR(-ENODEV); *devp = dev; *scp = tx_sc; *secyp = secy; return tx_sa; } static struct macsec_rx_sc *get_rxsc_from_nl(struct net *net, struct nlattr **attrs, struct nlattr **tb_rxsc, struct net_device **devp, struct macsec_secy **secyp) { struct net_device *dev; struct macsec_secy *secy; struct macsec_rx_sc *rx_sc; sci_t sci; dev = get_dev_from_nl(net, attrs); if (IS_ERR(dev)) return ERR_CAST(dev); secy = &macsec_priv(dev)->secy; if (!tb_rxsc[MACSEC_RXSC_ATTR_SCI]) return ERR_PTR(-EINVAL); sci = nla_get_sci(tb_rxsc[MACSEC_RXSC_ATTR_SCI]); rx_sc = find_rx_sc_rtnl(secy, sci); if (!rx_sc) return ERR_PTR(-ENODEV); *secyp = secy; *devp = dev; return rx_sc; } static struct macsec_rx_sa *get_rxsa_from_nl(struct net *net, struct nlattr **attrs, struct nlattr **tb_rxsc, struct nlattr **tb_sa, struct net_device **devp, struct macsec_secy **secyp, struct macsec_rx_sc **scp, u8 *assoc_num) { struct macsec_rx_sc *rx_sc; struct macsec_rx_sa *rx_sa; if (!tb_sa[MACSEC_SA_ATTR_AN]) return ERR_PTR(-EINVAL); *assoc_num = nla_get_u8(tb_sa[MACSEC_SA_ATTR_AN]); if (*assoc_num >= MACSEC_NUM_AN) return ERR_PTR(-EINVAL); rx_sc = get_rxsc_from_nl(net, attrs, tb_rxsc, devp, secyp); if (IS_ERR(rx_sc)) return ERR_CAST(rx_sc); rx_sa = rtnl_dereference(rx_sc->sa[*assoc_num]); if (!rx_sa) return ERR_PTR(-ENODEV); *scp = rx_sc; return rx_sa; } static const struct nla_policy macsec_genl_policy[NUM_MACSEC_ATTR] = { [MACSEC_ATTR_IFINDEX] = { .type = NLA_U32 }, [MACSEC_ATTR_RXSC_CONFIG] = { .type = NLA_NESTED }, [MACSEC_ATTR_SA_CONFIG] = { .type = NLA_NESTED }, [MACSEC_ATTR_OFFLOAD] = { .type = NLA_NESTED }, }; static const struct nla_policy macsec_genl_rxsc_policy[NUM_MACSEC_RXSC_ATTR] = { [MACSEC_RXSC_ATTR_SCI] = { .type = NLA_U64 }, [MACSEC_RXSC_ATTR_ACTIVE] = { .type = NLA_U8 }, }; static const struct nla_policy macsec_genl_sa_policy[NUM_MACSEC_SA_ATTR] = { [MACSEC_SA_ATTR_AN] = { .type = NLA_U8 }, [MACSEC_SA_ATTR_ACTIVE] = { .type = NLA_U8 }, [MACSEC_SA_ATTR_PN] = NLA_POLICY_MIN_LEN(4), [MACSEC_SA_ATTR_KEYID] = { .type = NLA_BINARY, .len = 
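/* --- Illustrative sketch (not part of the driver) ---
 * The 64-bit SCI handled by nla_get_sci()/nla_put_sci() above is, per
 * IEEE 802.1AE, the station's 48-bit MAC address followed by a 16-bit port
 * identifier. The helper below is hypothetical and builds the numeric value;
 * the driver stores the SCI in network byte order (__be64), so the byte
 * ordering details differ.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t sketch_make_sci(const uint8_t mac[6], uint16_t port)
{
	uint64_t sci = 0;

	for (int i = 0; i < 6; i++)
		sci = (sci << 8) | mac[i];
	return (sci << 16) | port;
}

int main(void)
{
	uint8_t mac[6] = { 0x52, 0x54, 0x00, 0x12, 0x34, 0x56 };

	/* port 1 is the usual choice for a single SecY on a port */
	printf("sci = 0x%016llx\n",
	       (unsigned long long)sketch_make_sci(mac, 1));
	return 0;
}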
MACSEC_KEYID_LEN, }, [MACSEC_SA_ATTR_KEY] = { .type = NLA_BINARY, .len = MACSEC_MAX_KEY_LEN, }, [MACSEC_SA_ATTR_SSCI] = { .type = NLA_U32 }, [MACSEC_SA_ATTR_SALT] = { .type = NLA_BINARY, .len = MACSEC_SALT_LEN, }, }; static const struct nla_policy macsec_genl_offload_policy[NUM_MACSEC_OFFLOAD_ATTR] = { [MACSEC_OFFLOAD_ATTR_TYPE] = { .type = NLA_U8 }, }; /* Offloads an operation to a device driver */ static int macsec_offload(int (* const func)(struct macsec_context *), struct macsec_context *ctx) { int ret; if (unlikely(!func)) return 0; if (ctx->offload == MACSEC_OFFLOAD_PHY) mutex_lock(&ctx->phydev->lock); /* Phase I: prepare. The drive should fail here if there are going to be * issues in the commit phase. */ ctx->prepare = true; ret = (*func)(ctx); if (ret) goto phy_unlock; /* Phase II: commit. This step cannot fail. */ ctx->prepare = false; ret = (*func)(ctx); /* This should never happen: commit is not allowed to fail */ if (unlikely(ret)) WARN(1, "MACsec offloading commit failed (%d)\n", ret); phy_unlock: if (ctx->offload == MACSEC_OFFLOAD_PHY) mutex_unlock(&ctx->phydev->lock); return ret; } static int parse_sa_config(struct nlattr **attrs, struct nlattr **tb_sa) { if (!attrs[MACSEC_ATTR_SA_CONFIG]) return -EINVAL; if (nla_parse_nested_deprecated(tb_sa, MACSEC_SA_ATTR_MAX, attrs[MACSEC_ATTR_SA_CONFIG], macsec_genl_sa_policy, NULL)) return -EINVAL; return 0; } static int parse_rxsc_config(struct nlattr **attrs, struct nlattr **tb_rxsc) { if (!attrs[MACSEC_ATTR_RXSC_CONFIG]) return -EINVAL; if (nla_parse_nested_deprecated(tb_rxsc, MACSEC_RXSC_ATTR_MAX, attrs[MACSEC_ATTR_RXSC_CONFIG], macsec_genl_rxsc_policy, NULL)) return -EINVAL; return 0; } static bool validate_add_rxsa(struct nlattr **attrs) { if (!attrs[MACSEC_SA_ATTR_AN] || !attrs[MACSEC_SA_ATTR_KEY] || !attrs[MACSEC_SA_ATTR_KEYID]) return false; if (nla_get_u8(attrs[MACSEC_SA_ATTR_AN]) >= MACSEC_NUM_AN) return false; if (attrs[MACSEC_SA_ATTR_PN] && nla_get_u64(attrs[MACSEC_SA_ATTR_PN]) == 0) return false; if (attrs[MACSEC_SA_ATTR_ACTIVE]) { if (nla_get_u8(attrs[MACSEC_SA_ATTR_ACTIVE]) > 1) return false; } if (nla_len(attrs[MACSEC_SA_ATTR_KEYID]) != MACSEC_KEYID_LEN) return false; return true; } static int macsec_add_rxsa(struct sk_buff *skb, struct genl_info *info) { struct net_device *dev; struct nlattr **attrs = info->attrs; struct macsec_secy *secy; struct macsec_rx_sc *rx_sc; struct macsec_rx_sa *rx_sa; unsigned char assoc_num; int pn_len; struct nlattr *tb_rxsc[MACSEC_RXSC_ATTR_MAX + 1]; struct nlattr *tb_sa[MACSEC_SA_ATTR_MAX + 1]; int err; if (!attrs[MACSEC_ATTR_IFINDEX]) return -EINVAL; if (parse_sa_config(attrs, tb_sa)) return -EINVAL; if (parse_rxsc_config(attrs, tb_rxsc)) return -EINVAL; if (!validate_add_rxsa(tb_sa)) return -EINVAL; rtnl_lock(); rx_sc = get_rxsc_from_nl(genl_info_net(info), attrs, tb_rxsc, &dev, &secy); if (IS_ERR(rx_sc)) { rtnl_unlock(); return PTR_ERR(rx_sc); } assoc_num = nla_get_u8(tb_sa[MACSEC_SA_ATTR_AN]); if (nla_len(tb_sa[MACSEC_SA_ATTR_KEY]) != secy->key_len) { pr_notice("macsec: nl: add_rxsa: bad key length: %d != %d\n", nla_len(tb_sa[MACSEC_SA_ATTR_KEY]), secy->key_len); rtnl_unlock(); return -EINVAL; } pn_len = secy->xpn ? 
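/* --- Illustrative sketch (not part of the driver) ---
 * The two-phase prepare/commit convention implemented by macsec_offload()
 * above: the same driver callback runs twice, first with ctx->prepare set,
 * where it may still refuse, then with prepare cleared, where failure is
 * treated as a bug. The context struct and callback below are stand-ins,
 * not the kernel types.
 */
#include <stdbool.h>
#include <stdio.h>

struct sketch_ctx {
	bool prepare;
	int resource;		/* whatever the operation needs */
};

static int sketch_two_phase(int (*op)(struct sketch_ctx *), struct sketch_ctx *ctx)
{
	int ret;

	ctx->prepare = true;	/* phase I: may fail, nothing has changed yet */
	ret = op(ctx);
	if (ret)
		return ret;

	ctx->prepare = false;	/* phase II: must not fail */
	ret = op(ctx);
	if (ret)
		fprintf(stderr, "BUG: commit failed (%d)\n", ret);
	return ret;
}

static int sketch_op(struct sketch_ctx *ctx)
{
	if (ctx->prepare)
		return ctx->resource > 0 ? 0 : -1;	/* only check capacity */
	printf("committed with resource %d\n", ctx->resource);
	return 0;
}

int main(void)
{
	struct sketch_ctx ctx = { .resource = 4 };

	return sketch_two_phase(sketch_op, &ctx);
}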
MACSEC_XPN_PN_LEN : MACSEC_DEFAULT_PN_LEN; if (tb_sa[MACSEC_SA_ATTR_PN] && nla_len(tb_sa[MACSEC_SA_ATTR_PN]) != pn_len) { pr_notice("macsec: nl: add_rxsa: bad pn length: %d != %d\n", nla_len(tb_sa[MACSEC_SA_ATTR_PN]), pn_len); rtnl_unlock(); return -EINVAL; } if (secy->xpn) { if (!tb_sa[MACSEC_SA_ATTR_SSCI] || !tb_sa[MACSEC_SA_ATTR_SALT]) { rtnl_unlock(); return -EINVAL; } if (nla_len(tb_sa[MACSEC_SA_ATTR_SALT]) != MACSEC_SALT_LEN) { pr_notice("macsec: nl: add_rxsa: bad salt length: %d != %d\n", nla_len(tb_sa[MACSEC_SA_ATTR_SALT]), MACSEC_SALT_LEN); rtnl_unlock(); return -EINVAL; } } rx_sa = rtnl_dereference(rx_sc->sa[assoc_num]); if (rx_sa) { rtnl_unlock(); return -EBUSY; } rx_sa = kmalloc(sizeof(*rx_sa), GFP_KERNEL); if (!rx_sa) { rtnl_unlock(); return -ENOMEM; } err = init_rx_sa(rx_sa, nla_data(tb_sa[MACSEC_SA_ATTR_KEY]), secy->key_len, secy->icv_len); if (err < 0) { kfree(rx_sa); rtnl_unlock(); return err; } if (tb_sa[MACSEC_SA_ATTR_PN]) { spin_lock_bh(&rx_sa->lock); rx_sa->next_pn = nla_get_u64(tb_sa[MACSEC_SA_ATTR_PN]); spin_unlock_bh(&rx_sa->lock); } if (tb_sa[MACSEC_SA_ATTR_ACTIVE]) rx_sa->active = !!nla_get_u8(tb_sa[MACSEC_SA_ATTR_ACTIVE]); rx_sa->sc = rx_sc; /* If h/w offloading is available, propagate to the device */ if (macsec_is_offloaded(netdev_priv(dev))) { const struct macsec_ops *ops; struct macsec_context ctx; ops = macsec_get_ops(netdev_priv(dev), &ctx); if (!ops) { err = -EOPNOTSUPP; goto cleanup; } ctx.sa.assoc_num = assoc_num; ctx.sa.rx_sa = rx_sa; ctx.secy = secy; memcpy(ctx.sa.key, nla_data(tb_sa[MACSEC_SA_ATTR_KEY]), secy->key_len); err = macsec_offload(ops->mdo_add_rxsa, &ctx); memzero_explicit(ctx.sa.key, secy->key_len); if (err) goto cleanup; } if (secy->xpn) { rx_sa->ssci = nla_get_ssci(tb_sa[MACSEC_SA_ATTR_SSCI]); nla_memcpy(rx_sa->key.salt.bytes, tb_sa[MACSEC_SA_ATTR_SALT], MACSEC_SALT_LEN); } nla_memcpy(rx_sa->key.id, tb_sa[MACSEC_SA_ATTR_KEYID], MACSEC_KEYID_LEN); rcu_assign_pointer(rx_sc->sa[assoc_num], rx_sa); rtnl_unlock(); return 0; cleanup: macsec_rxsa_put(rx_sa); rtnl_unlock(); return err; } static bool validate_add_rxsc(struct nlattr **attrs) { if (!attrs[MACSEC_RXSC_ATTR_SCI]) return false; if (attrs[MACSEC_RXSC_ATTR_ACTIVE]) { if (nla_get_u8(attrs[MACSEC_RXSC_ATTR_ACTIVE]) > 1) return false; } return true; } static int macsec_add_rxsc(struct sk_buff *skb, struct genl_info *info) { struct net_device *dev; sci_t sci = MACSEC_UNDEF_SCI; struct nlattr **attrs = info->attrs; struct macsec_rx_sc *rx_sc; struct nlattr *tb_rxsc[MACSEC_RXSC_ATTR_MAX + 1]; struct macsec_secy *secy; bool active = true; int ret; if (!attrs[MACSEC_ATTR_IFINDEX]) return -EINVAL; if (parse_rxsc_config(attrs, tb_rxsc)) return -EINVAL; if (!validate_add_rxsc(tb_rxsc)) return -EINVAL; rtnl_lock(); dev = get_dev_from_nl(genl_info_net(info), attrs); if (IS_ERR(dev)) { rtnl_unlock(); return PTR_ERR(dev); } secy = &macsec_priv(dev)->secy; sci = nla_get_sci(tb_rxsc[MACSEC_RXSC_ATTR_SCI]); if (tb_rxsc[MACSEC_RXSC_ATTR_ACTIVE]) active = nla_get_u8(tb_rxsc[MACSEC_RXSC_ATTR_ACTIVE]); rx_sc = create_rx_sc(dev, sci, active); if (IS_ERR(rx_sc)) { rtnl_unlock(); return PTR_ERR(rx_sc); } if (macsec_is_offloaded(netdev_priv(dev))) { const struct macsec_ops *ops; struct macsec_context ctx; ops = macsec_get_ops(netdev_priv(dev), &ctx); if (!ops) { ret = -EOPNOTSUPP; goto cleanup; } ctx.rx_sc = rx_sc; ctx.secy = secy; ret = macsec_offload(ops->mdo_add_rxsc, &ctx); if (ret) goto cleanup; } rtnl_unlock(); return 0; cleanup: del_rx_sc(secy, sci); free_rx_sc(rx_sc); rtnl_unlock(); return ret; } static 
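/* --- Illustrative sketch (not part of the driver) ---
 * The extra netlink attribute requirements macsec_add_rxsa() above enforces
 * for XPN cipher suites: an 8-byte packet number instead of 4, plus a
 * mandatory 32-bit SSCI and a 12-byte salt. The SKETCH_* lengths are assumed
 * to match the driver's MACSEC_* constants.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define SKETCH_DEFAULT_PN_LEN	4
#define SKETCH_XPN_PN_LEN	8
#define SKETCH_SALT_LEN		12

struct sketch_sa_req {
	size_t pn_len;		/* 0 when the PN attribute is absent */
	bool has_ssci;
	size_t salt_len;	/* 0 when the salt attribute is absent */
};

static bool sketch_sa_req_ok(const struct sketch_sa_req *req, bool xpn)
{
	size_t want_pn = xpn ? SKETCH_XPN_PN_LEN : SKETCH_DEFAULT_PN_LEN;

	if (req->pn_len && req->pn_len != want_pn)
		return false;
	if (xpn && (!req->has_ssci || req->salt_len != SKETCH_SALT_LEN))
		return false;
	return true;
}

int main(void)
{
	struct sketch_sa_req req = { .pn_len = 8, .has_ssci = true, .salt_len = 12 };

	printf("xpn add_rxsa ok: %d\n", sketch_sa_req_ok(&req, true));
	req.salt_len = 0;
	printf("missing salt ok: %d\n", sketch_sa_req_ok(&req, true));
	return 0;
}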
bool validate_add_txsa(struct nlattr **attrs) { if (!attrs[MACSEC_SA_ATTR_AN] || !attrs[MACSEC_SA_ATTR_PN] || !attrs[MACSEC_SA_ATTR_KEY] || !attrs[MACSEC_SA_ATTR_KEYID]) return false; if (nla_get_u8(attrs[MACSEC_SA_ATTR_AN]) >= MACSEC_NUM_AN) return false; if (nla_get_u64(attrs[MACSEC_SA_ATTR_PN]) == 0) return false; if (attrs[MACSEC_SA_ATTR_ACTIVE]) { if (nla_get_u8(attrs[MACSEC_SA_ATTR_ACTIVE]) > 1) return false; } if (nla_len(attrs[MACSEC_SA_ATTR_KEYID]) != MACSEC_KEYID_LEN) return false; return true; } static int macsec_add_txsa(struct sk_buff *skb, struct genl_info *info) { struct net_device *dev; struct nlattr **attrs = info->attrs; struct macsec_secy *secy; struct macsec_tx_sc *tx_sc; struct macsec_tx_sa *tx_sa; unsigned char assoc_num; int pn_len; struct nlattr *tb_sa[MACSEC_SA_ATTR_MAX + 1]; bool was_operational; int err; if (!attrs[MACSEC_ATTR_IFINDEX]) return -EINVAL; if (parse_sa_config(attrs, tb_sa)) return -EINVAL; if (!validate_add_txsa(tb_sa)) return -EINVAL; rtnl_lock(); dev = get_dev_from_nl(genl_info_net(info), attrs); if (IS_ERR(dev)) { rtnl_unlock(); return PTR_ERR(dev); } secy = &macsec_priv(dev)->secy; tx_sc = &secy->tx_sc; assoc_num = nla_get_u8(tb_sa[MACSEC_SA_ATTR_AN]); if (nla_len(tb_sa[MACSEC_SA_ATTR_KEY]) != secy->key_len) { pr_notice("macsec: nl: add_txsa: bad key length: %d != %d\n", nla_len(tb_sa[MACSEC_SA_ATTR_KEY]), secy->key_len); rtnl_unlock(); return -EINVAL; } pn_len = secy->xpn ? MACSEC_XPN_PN_LEN : MACSEC_DEFAULT_PN_LEN; if (nla_len(tb_sa[MACSEC_SA_ATTR_PN]) != pn_len) { pr_notice("macsec: nl: add_txsa: bad pn length: %d != %d\n", nla_len(tb_sa[MACSEC_SA_ATTR_PN]), pn_len); rtnl_unlock(); return -EINVAL; } if (secy->xpn) { if (!tb_sa[MACSEC_SA_ATTR_SSCI] || !tb_sa[MACSEC_SA_ATTR_SALT]) { rtnl_unlock(); return -EINVAL; } if (nla_len(tb_sa[MACSEC_SA_ATTR_SALT]) != MACSEC_SALT_LEN) { pr_notice("macsec: nl: add_txsa: bad salt length: %d != %d\n", nla_len(tb_sa[MACSEC_SA_ATTR_SALT]), MACSEC_SALT_LEN); rtnl_unlock(); return -EINVAL; } } tx_sa = rtnl_dereference(tx_sc->sa[assoc_num]); if (tx_sa) { rtnl_unlock(); return -EBUSY; } tx_sa = kmalloc(sizeof(*tx_sa), GFP_KERNEL); if (!tx_sa) { rtnl_unlock(); return -ENOMEM; } err = init_tx_sa(tx_sa, nla_data(tb_sa[MACSEC_SA_ATTR_KEY]), secy->key_len, secy->icv_len); if (err < 0) { kfree(tx_sa); rtnl_unlock(); return err; } spin_lock_bh(&tx_sa->lock); tx_sa->next_pn = nla_get_u64(tb_sa[MACSEC_SA_ATTR_PN]); spin_unlock_bh(&tx_sa->lock); if (tb_sa[MACSEC_SA_ATTR_ACTIVE]) tx_sa->active = !!nla_get_u8(tb_sa[MACSEC_SA_ATTR_ACTIVE]); was_operational = secy->operational; if (assoc_num == tx_sc->encoding_sa && tx_sa->active) secy->operational = true; /* If h/w offloading is available, propagate to the device */ if (macsec_is_offloaded(netdev_priv(dev))) { const struct macsec_ops *ops; struct macsec_context ctx; ops = macsec_get_ops(netdev_priv(dev), &ctx); if (!ops) { err = -EOPNOTSUPP; goto cleanup; } ctx.sa.assoc_num = assoc_num; ctx.sa.tx_sa = tx_sa; ctx.secy = secy; memcpy(ctx.sa.key, nla_data(tb_sa[MACSEC_SA_ATTR_KEY]), secy->key_len); err = macsec_offload(ops->mdo_add_txsa, &ctx); memzero_explicit(ctx.sa.key, secy->key_len); if (err) goto cleanup; } if (secy->xpn) { tx_sa->ssci = nla_get_ssci(tb_sa[MACSEC_SA_ATTR_SSCI]); nla_memcpy(tx_sa->key.salt.bytes, tb_sa[MACSEC_SA_ATTR_SALT], MACSEC_SALT_LEN); } nla_memcpy(tx_sa->key.id, tb_sa[MACSEC_SA_ATTR_KEYID], MACSEC_KEYID_LEN); rcu_assign_pointer(tx_sc->sa[assoc_num], tx_sa); rtnl_unlock(); return 0; cleanup: secy->operational = was_operational; macsec_txsa_put(tx_sa); 
rtnl_unlock(); return err; } static int macsec_del_rxsa(struct sk_buff *skb, struct genl_info *info) { struct nlattr **attrs = info->attrs; struct net_device *dev; struct macsec_secy *secy; struct macsec_rx_sc *rx_sc; struct macsec_rx_sa *rx_sa; u8 assoc_num; struct nlattr *tb_rxsc[MACSEC_RXSC_ATTR_MAX + 1]; struct nlattr *tb_sa[MACSEC_SA_ATTR_MAX + 1]; int ret; if (!attrs[MACSEC_ATTR_IFINDEX]) return -EINVAL; if (parse_sa_config(attrs, tb_sa)) return -EINVAL; if (parse_rxsc_config(attrs, tb_rxsc)) return -EINVAL; rtnl_lock(); rx_sa = get_rxsa_from_nl(genl_info_net(info), attrs, tb_rxsc, tb_sa, &dev, &secy, &rx_sc, &assoc_num); if (IS_ERR(rx_sa)) { rtnl_unlock(); return PTR_ERR(rx_sa); } if (rx_sa->active) { rtnl_unlock(); return -EBUSY; } /* If h/w offloading is available, propagate to the device */ if (macsec_is_offloaded(netdev_priv(dev))) { const struct macsec_ops *ops; struct macsec_context ctx; ops = macsec_get_ops(netdev_priv(dev), &ctx); if (!ops) { ret = -EOPNOTSUPP; goto cleanup; } ctx.sa.assoc_num = assoc_num; ctx.sa.rx_sa = rx_sa; ctx.secy = secy; ret = macsec_offload(ops->mdo_del_rxsa, &ctx); if (ret) goto cleanup; } RCU_INIT_POINTER(rx_sc->sa[assoc_num], NULL); clear_rx_sa(rx_sa); rtnl_unlock(); return 0; cleanup: rtnl_unlock(); return ret; } static int macsec_del_rxsc(struct sk_buff *skb, struct genl_info *info) { struct nlattr **attrs = info->attrs; struct net_device *dev; struct macsec_secy *secy; struct macsec_rx_sc *rx_sc; sci_t sci; struct nlattr *tb_rxsc[MACSEC_RXSC_ATTR_MAX + 1]; int ret; if (!attrs[MACSEC_ATTR_IFINDEX]) return -EINVAL; if (parse_rxsc_config(attrs, tb_rxsc)) return -EINVAL; if (!tb_rxsc[MACSEC_RXSC_ATTR_SCI]) return -EINVAL; rtnl_lock(); dev = get_dev_from_nl(genl_info_net(info), info->attrs); if (IS_ERR(dev)) { rtnl_unlock(); return PTR_ERR(dev); } secy = &macsec_priv(dev)->secy; sci = nla_get_sci(tb_rxsc[MACSEC_RXSC_ATTR_SCI]); rx_sc = del_rx_sc(secy, sci); if (!rx_sc) { rtnl_unlock(); return -ENODEV; } /* If h/w offloading is available, propagate to the device */ if (macsec_is_offloaded(netdev_priv(dev))) { const struct macsec_ops *ops; struct macsec_context ctx; ops = macsec_get_ops(netdev_priv(dev), &ctx); if (!ops) { ret = -EOPNOTSUPP; goto cleanup; } ctx.rx_sc = rx_sc; ctx.secy = secy; ret = macsec_offload(ops->mdo_del_rxsc, &ctx); if (ret) goto cleanup; } free_rx_sc(rx_sc); rtnl_unlock(); return 0; cleanup: rtnl_unlock(); return ret; } static int macsec_del_txsa(struct sk_buff *skb, struct genl_info *info) { struct nlattr **attrs = info->attrs; struct net_device *dev; struct macsec_secy *secy; struct macsec_tx_sc *tx_sc; struct macsec_tx_sa *tx_sa; u8 assoc_num; struct nlattr *tb_sa[MACSEC_SA_ATTR_MAX + 1]; int ret; if (!attrs[MACSEC_ATTR_IFINDEX]) return -EINVAL; if (parse_sa_config(attrs, tb_sa)) return -EINVAL; rtnl_lock(); tx_sa = get_txsa_from_nl(genl_info_net(info), attrs, tb_sa, &dev, &secy, &tx_sc, &assoc_num); if (IS_ERR(tx_sa)) { rtnl_unlock(); return PTR_ERR(tx_sa); } if (tx_sa->active) { rtnl_unlock(); return -EBUSY; } /* If h/w offloading is available, propagate to the device */ if (macsec_is_offloaded(netdev_priv(dev))) { const struct macsec_ops *ops; struct macsec_context ctx; ops = macsec_get_ops(netdev_priv(dev), &ctx); if (!ops) { ret = -EOPNOTSUPP; goto cleanup; } ctx.sa.assoc_num = assoc_num; ctx.sa.tx_sa = tx_sa; ctx.secy = secy; ret = macsec_offload(ops->mdo_del_txsa, &ctx); if (ret) goto cleanup; } RCU_INIT_POINTER(tx_sc->sa[assoc_num], NULL); clear_tx_sa(tx_sa); rtnl_unlock(); return 0; cleanup: rtnl_unlock(); return 
ret; } static bool validate_upd_sa(struct nlattr **attrs) { if (!attrs[MACSEC_SA_ATTR_AN] || attrs[MACSEC_SA_ATTR_KEY] || attrs[MACSEC_SA_ATTR_KEYID] || attrs[MACSEC_SA_ATTR_SSCI] || attrs[MACSEC_SA_ATTR_SALT]) return false; if (nla_get_u8(attrs[MACSEC_SA_ATTR_AN]) >= MACSEC_NUM_AN) return false; if (attrs[MACSEC_SA_ATTR_PN] && nla_get_u64(attrs[MACSEC_SA_ATTR_PN]) == 0) return false; if (attrs[MACSEC_SA_ATTR_ACTIVE]) { if (nla_get_u8(attrs[MACSEC_SA_ATTR_ACTIVE]) > 1) return false; } return true; } static int macsec_upd_txsa(struct sk_buff *skb, struct genl_info *info) { struct nlattr **attrs = info->attrs; struct net_device *dev; struct macsec_secy *secy; struct macsec_tx_sc *tx_sc; struct macsec_tx_sa *tx_sa; u8 assoc_num; struct nlattr *tb_sa[MACSEC_SA_ATTR_MAX + 1]; bool was_operational, was_active; pn_t prev_pn; int ret = 0; prev_pn.full64 = 0; if (!attrs[MACSEC_ATTR_IFINDEX]) return -EINVAL; if (parse_sa_config(attrs, tb_sa)) return -EINVAL; if (!validate_upd_sa(tb_sa)) return -EINVAL; rtnl_lock(); tx_sa = get_txsa_from_nl(genl_info_net(info), attrs, tb_sa, &dev, &secy, &tx_sc, &assoc_num); if (IS_ERR(tx_sa)) { rtnl_unlock(); return PTR_ERR(tx_sa); } if (tb_sa[MACSEC_SA_ATTR_PN]) { int pn_len; pn_len = secy->xpn ? MACSEC_XPN_PN_LEN : MACSEC_DEFAULT_PN_LEN; if (nla_len(tb_sa[MACSEC_SA_ATTR_PN]) != pn_len) { pr_notice("macsec: nl: upd_txsa: bad pn length: %d != %d\n", nla_len(tb_sa[MACSEC_SA_ATTR_PN]), pn_len); rtnl_unlock(); return -EINVAL; } spin_lock_bh(&tx_sa->lock); prev_pn = tx_sa->next_pn_halves; tx_sa->next_pn = nla_get_u64(tb_sa[MACSEC_SA_ATTR_PN]); spin_unlock_bh(&tx_sa->lock); } was_active = tx_sa->active; if (tb_sa[MACSEC_SA_ATTR_ACTIVE]) tx_sa->active = nla_get_u8(tb_sa[MACSEC_SA_ATTR_ACTIVE]); was_operational = secy->operational; if (assoc_num == tx_sc->encoding_sa) secy->operational = tx_sa->active; /* If h/w offloading is available, propagate to the device */ if (macsec_is_offloaded(netdev_priv(dev))) { const struct macsec_ops *ops; struct macsec_context ctx; ops = macsec_get_ops(netdev_priv(dev), &ctx); if (!ops) { ret = -EOPNOTSUPP; goto cleanup; } ctx.sa.assoc_num = assoc_num; ctx.sa.tx_sa = tx_sa; ctx.sa.update_pn = !!prev_pn.full64; ctx.secy = secy; ret = macsec_offload(ops->mdo_upd_txsa, &ctx); if (ret) goto cleanup; } rtnl_unlock(); return 0; cleanup: if (tb_sa[MACSEC_SA_ATTR_PN]) { spin_lock_bh(&tx_sa->lock); tx_sa->next_pn_halves = prev_pn; spin_unlock_bh(&tx_sa->lock); } tx_sa->active = was_active; secy->operational = was_operational; rtnl_unlock(); return ret; } static int macsec_upd_rxsa(struct sk_buff *skb, struct genl_info *info) { struct nlattr **attrs = info->attrs; struct net_device *dev; struct macsec_secy *secy; struct macsec_rx_sc *rx_sc; struct macsec_rx_sa *rx_sa; u8 assoc_num; struct nlattr *tb_rxsc[MACSEC_RXSC_ATTR_MAX + 1]; struct nlattr *tb_sa[MACSEC_SA_ATTR_MAX + 1]; bool was_active; pn_t prev_pn; int ret = 0; prev_pn.full64 = 0; if (!attrs[MACSEC_ATTR_IFINDEX]) return -EINVAL; if (parse_rxsc_config(attrs, tb_rxsc)) return -EINVAL; if (parse_sa_config(attrs, tb_sa)) return -EINVAL; if (!validate_upd_sa(tb_sa)) return -EINVAL; rtnl_lock(); rx_sa = get_rxsa_from_nl(genl_info_net(info), attrs, tb_rxsc, tb_sa, &dev, &secy, &rx_sc, &assoc_num); if (IS_ERR(rx_sa)) { rtnl_unlock(); return PTR_ERR(rx_sa); } if (tb_sa[MACSEC_SA_ATTR_PN]) { int pn_len; pn_len = secy->xpn ? 
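/* --- Illustrative sketch (not part of the driver) ---
 * The save/restore pattern macsec_upd_txsa() above uses so that a failed
 * hardware offload leaves the software state untouched: capture the previous
 * PN and active flag, apply the update, and roll everything back if the
 * device rejects it. The types and the "offload" hook are stand-ins.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct sketch_txsa { uint64_t next_pn; bool active; };

static int sketch_upd_txsa(struct sketch_txsa *sa, uint64_t new_pn, bool new_active,
			   int (*offload)(const struct sketch_txsa *))
{
	uint64_t prev_pn = sa->next_pn;
	bool was_active = sa->active;
	int ret;

	sa->next_pn = new_pn;
	sa->active = new_active;

	ret = offload ? offload(sa) : 0;
	if (ret) {
		sa->next_pn = prev_pn;	/* undo everything on failure */
		sa->active = was_active;
	}
	return ret;
}

static int sketch_reject(const struct sketch_txsa *sa) { (void)sa; return -1; }

int main(void)
{
	struct sketch_txsa sa = { .next_pn = 100, .active = true };

	sketch_upd_txsa(&sa, 1, false, sketch_reject);
	printf("after failed update: pn=%llu active=%d\n",
	       (unsigned long long)sa.next_pn, sa.active);
	return 0;
}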
MACSEC_XPN_PN_LEN : MACSEC_DEFAULT_PN_LEN; if (nla_len(tb_sa[MACSEC_SA_ATTR_PN]) != pn_len) { pr_notice("macsec: nl: upd_rxsa: bad pn length: %d != %d\n", nla_len(tb_sa[MACSEC_SA_ATTR_PN]), pn_len); rtnl_unlock(); return -EINVAL; } spin_lock_bh(&rx_sa->lock); prev_pn = rx_sa->next_pn_halves; rx_sa->next_pn = nla_get_u64(tb_sa[MACSEC_SA_ATTR_PN]); spin_unlock_bh(&rx_sa->lock); } was_active = rx_sa->active; if (tb_sa[MACSEC_SA_ATTR_ACTIVE]) rx_sa->active = nla_get_u8(tb_sa[MACSEC_SA_ATTR_ACTIVE]); /* If h/w offloading is available, propagate to the device */ if (macsec_is_offloaded(netdev_priv(dev))) { const struct macsec_ops *ops; struct macsec_context ctx; ops = macsec_get_ops(netdev_priv(dev), &ctx); if (!ops) { ret = -EOPNOTSUPP; goto cleanup; } ctx.sa.assoc_num = assoc_num; ctx.sa.rx_sa = rx_sa; ctx.sa.update_pn = !!prev_pn.full64; ctx.secy = secy; ret = macsec_offload(ops->mdo_upd_rxsa, &ctx); if (ret) goto cleanup; } rtnl_unlock(); return 0; cleanup: if (tb_sa[MACSEC_SA_ATTR_PN]) { spin_lock_bh(&rx_sa->lock); rx_sa->next_pn_halves = prev_pn; spin_unlock_bh(&rx_sa->lock); } rx_sa->active = was_active; rtnl_unlock(); return ret; } static int macsec_upd_rxsc(struct sk_buff *skb, struct genl_info *info) { struct nlattr **attrs = info->attrs; struct net_device *dev; struct macsec_secy *secy; struct macsec_rx_sc *rx_sc; struct nlattr *tb_rxsc[MACSEC_RXSC_ATTR_MAX + 1]; unsigned int prev_n_rx_sc; bool was_active; int ret; if (!attrs[MACSEC_ATTR_IFINDEX]) return -EINVAL; if (parse_rxsc_config(attrs, tb_rxsc)) return -EINVAL; if (!validate_add_rxsc(tb_rxsc)) return -EINVAL; rtnl_lock(); rx_sc = get_rxsc_from_nl(genl_info_net(info), attrs, tb_rxsc, &dev, &secy); if (IS_ERR(rx_sc)) { rtnl_unlock(); return PTR_ERR(rx_sc); } was_active = rx_sc->active; prev_n_rx_sc = secy->n_rx_sc; if (tb_rxsc[MACSEC_RXSC_ATTR_ACTIVE]) { bool new = !!nla_get_u8(tb_rxsc[MACSEC_RXSC_ATTR_ACTIVE]); if (rx_sc->active != new) secy->n_rx_sc += new ? 
1 : -1; rx_sc->active = new; } /* If h/w offloading is available, propagate to the device */ if (macsec_is_offloaded(netdev_priv(dev))) { const struct macsec_ops *ops; struct macsec_context ctx; ops = macsec_get_ops(netdev_priv(dev), &ctx); if (!ops) { ret = -EOPNOTSUPP; goto cleanup; } ctx.rx_sc = rx_sc; ctx.secy = secy; ret = macsec_offload(ops->mdo_upd_rxsc, &ctx); if (ret) goto cleanup; } rtnl_unlock(); return 0; cleanup: secy->n_rx_sc = prev_n_rx_sc; rx_sc->active = was_active; rtnl_unlock(); return ret; } static bool macsec_is_configured(struct macsec_dev *macsec) { struct macsec_secy *secy = &macsec->secy; struct macsec_tx_sc *tx_sc = &secy->tx_sc; int i; if (secy->rx_sc) return true; for (i = 0; i < MACSEC_NUM_AN; i++) if (tx_sc->sa[i]) return true; return false; } static int macsec_upd_offload(struct sk_buff *skb, struct genl_info *info) { struct nlattr *tb_offload[MACSEC_OFFLOAD_ATTR_MAX + 1]; enum macsec_offload offload, prev_offload; int (*func)(struct macsec_context *ctx); struct nlattr **attrs = info->attrs; struct net_device *dev; const struct macsec_ops *ops; struct macsec_context ctx; struct macsec_dev *macsec; int ret = 0; if (!attrs[MACSEC_ATTR_IFINDEX]) return -EINVAL; if (!attrs[MACSEC_ATTR_OFFLOAD]) return -EINVAL; if (nla_parse_nested_deprecated(tb_offload, MACSEC_OFFLOAD_ATTR_MAX, attrs[MACSEC_ATTR_OFFLOAD], macsec_genl_offload_policy, NULL)) return -EINVAL; rtnl_lock(); dev = get_dev_from_nl(genl_info_net(info), attrs); if (IS_ERR(dev)) { ret = PTR_ERR(dev); goto out; } macsec = macsec_priv(dev); if (!tb_offload[MACSEC_OFFLOAD_ATTR_TYPE]) { ret = -EINVAL; goto out; } offload = nla_get_u8(tb_offload[MACSEC_OFFLOAD_ATTR_TYPE]); if (macsec->offload == offload) goto out; /* Check if the offloading mode is supported by the underlying layers */ if (offload != MACSEC_OFFLOAD_OFF && !macsec_check_offload(offload, macsec)) { ret = -EOPNOTSUPP; goto out; } /* Check if the net device is busy. */ if (netif_running(dev)) { ret = -EBUSY; goto out; } prev_offload = macsec->offload; macsec->offload = offload; /* Check if the device already has rules configured: we do not support * rules migration. */ if (macsec_is_configured(macsec)) { ret = -EBUSY; goto rollback; } ops = __macsec_get_ops(offload == MACSEC_OFFLOAD_OFF ? 
prev_offload : offload, macsec, &ctx); if (!ops) { ret = -EOPNOTSUPP; goto rollback; } if (prev_offload == MACSEC_OFFLOAD_OFF) func = ops->mdo_add_secy; else func = ops->mdo_del_secy; ctx.secy = &macsec->secy; ret = macsec_offload(func, &ctx); if (ret) goto rollback; rtnl_unlock(); return 0; rollback: macsec->offload = prev_offload; out: rtnl_unlock(); return ret; } static void get_tx_sa_stats(struct net_device *dev, int an, struct macsec_tx_sa *tx_sa, struct macsec_tx_sa_stats *sum) { struct macsec_dev *macsec = macsec_priv(dev); int cpu; /* If h/w offloading is available, propagate to the device */ if (macsec_is_offloaded(macsec)) { const struct macsec_ops *ops; struct macsec_context ctx; ops = macsec_get_ops(macsec, &ctx); if (ops) { ctx.sa.assoc_num = an; ctx.sa.tx_sa = tx_sa; ctx.stats.tx_sa_stats = sum; ctx.secy = &macsec_priv(dev)->secy; macsec_offload(ops->mdo_get_tx_sa_stats, &ctx); } return; } for_each_possible_cpu(cpu) { const struct macsec_tx_sa_stats *stats = per_cpu_ptr(tx_sa->stats, cpu); sum->OutPktsProtected += stats->OutPktsProtected; sum->OutPktsEncrypted += stats->OutPktsEncrypted; } } static int copy_tx_sa_stats(struct sk_buff *skb, struct macsec_tx_sa_stats *sum) { if (nla_put_u32(skb, MACSEC_SA_STATS_ATTR_OUT_PKTS_PROTECTED, sum->OutPktsProtected) || nla_put_u32(skb, MACSEC_SA_STATS_ATTR_OUT_PKTS_ENCRYPTED, sum->OutPktsEncrypted)) return -EMSGSIZE; return 0; } static void get_rx_sa_stats(struct net_device *dev, struct macsec_rx_sc *rx_sc, int an, struct macsec_rx_sa *rx_sa, struct macsec_rx_sa_stats *sum) { struct macsec_dev *macsec = macsec_priv(dev); int cpu; /* If h/w offloading is available, propagate to the device */ if (macsec_is_offloaded(macsec)) { const struct macsec_ops *ops; struct macsec_context ctx; ops = macsec_get_ops(macsec, &ctx); if (ops) { ctx.sa.assoc_num = an; ctx.sa.rx_sa = rx_sa; ctx.stats.rx_sa_stats = sum; ctx.secy = &macsec_priv(dev)->secy; ctx.rx_sc = rx_sc; macsec_offload(ops->mdo_get_rx_sa_stats, &ctx); } return; } for_each_possible_cpu(cpu) { const struct macsec_rx_sa_stats *stats = per_cpu_ptr(rx_sa->stats, cpu); sum->InPktsOK += stats->InPktsOK; sum->InPktsInvalid += stats->InPktsInvalid; sum->InPktsNotValid += stats->InPktsNotValid; sum->InPktsNotUsingSA += stats->InPktsNotUsingSA; sum->InPktsUnusedSA += stats->InPktsUnusedSA; } } static int copy_rx_sa_stats(struct sk_buff *skb, struct macsec_rx_sa_stats *sum) { if (nla_put_u32(skb, MACSEC_SA_STATS_ATTR_IN_PKTS_OK, sum->InPktsOK) || nla_put_u32(skb, MACSEC_SA_STATS_ATTR_IN_PKTS_INVALID, sum->InPktsInvalid) || nla_put_u32(skb, MACSEC_SA_STATS_ATTR_IN_PKTS_NOT_VALID, sum->InPktsNotValid) || nla_put_u32(skb, MACSEC_SA_STATS_ATTR_IN_PKTS_NOT_USING_SA, sum->InPktsNotUsingSA) || nla_put_u32(skb, MACSEC_SA_STATS_ATTR_IN_PKTS_UNUSED_SA, sum->InPktsUnusedSA)) return -EMSGSIZE; return 0; } static void get_rx_sc_stats(struct net_device *dev, struct macsec_rx_sc *rx_sc, struct macsec_rx_sc_stats *sum) { struct macsec_dev *macsec = macsec_priv(dev); int cpu; /* If h/w offloading is available, propagate to the device */ if (macsec_is_offloaded(macsec)) { const struct macsec_ops *ops; struct macsec_context ctx; ops = macsec_get_ops(macsec, &ctx); if (ops) { ctx.stats.rx_sc_stats = sum; ctx.secy = &macsec_priv(dev)->secy; ctx.rx_sc = rx_sc; macsec_offload(ops->mdo_get_rx_sc_stats, &ctx); } return; } for_each_possible_cpu(cpu) { const struct pcpu_rx_sc_stats *stats; struct macsec_rx_sc_stats tmp; unsigned int start; stats = per_cpu_ptr(rx_sc->stats, cpu); do { start = 
u64_stats_fetch_begin_irq(&stats->syncp); memcpy(&tmp, &stats->stats, sizeof(tmp)); } while (u64_stats_fetch_retry_irq(&stats->syncp, start)); sum->InOctetsValidated += tmp.InOctetsValidated; sum->InOctetsDecrypted += tmp.InOctetsDecrypted; sum->InPktsUnchecked += tmp.InPktsUnchecked; sum->InPktsDelayed += tmp.InPktsDelayed; sum->InPktsOK += tmp.InPktsOK; sum->InPktsInvalid += tmp.InPktsInvalid; sum->InPktsLate += tmp.InPktsLate; sum->InPktsNotValid += tmp.InPktsNotValid; sum->InPktsNotUsingSA += tmp.InPktsNotUsingSA; sum->InPktsUnusedSA += tmp.InPktsUnusedSA; } } static int copy_rx_sc_stats(struct sk_buff *skb, struct macsec_rx_sc_stats *sum) { if (nla_put_u64_64bit(skb, MACSEC_RXSC_STATS_ATTR_IN_OCTETS_VALIDATED, sum->InOctetsValidated, MACSEC_RXSC_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, MACSEC_RXSC_STATS_ATTR_IN_OCTETS_DECRYPTED, sum->InOctetsDecrypted, MACSEC_RXSC_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, MACSEC_RXSC_STATS_ATTR_IN_PKTS_UNCHECKED, sum->InPktsUnchecked, MACSEC_RXSC_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, MACSEC_RXSC_STATS_ATTR_IN_PKTS_DELAYED, sum->InPktsDelayed, MACSEC_RXSC_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, MACSEC_RXSC_STATS_ATTR_IN_PKTS_OK, sum->InPktsOK, MACSEC_RXSC_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, MACSEC_RXSC_STATS_ATTR_IN_PKTS_INVALID, sum->InPktsInvalid, MACSEC_RXSC_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, MACSEC_RXSC_STATS_ATTR_IN_PKTS_LATE, sum->InPktsLate, MACSEC_RXSC_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, MACSEC_RXSC_STATS_ATTR_IN_PKTS_NOT_VALID, sum->InPktsNotValid, MACSEC_RXSC_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, MACSEC_RXSC_STATS_ATTR_IN_PKTS_NOT_USING_SA, sum->InPktsNotUsingSA, MACSEC_RXSC_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, MACSEC_RXSC_STATS_ATTR_IN_PKTS_UNUSED_SA, sum->InPktsUnusedSA, MACSEC_RXSC_STATS_ATTR_PAD)) return -EMSGSIZE; return 0; } static void get_tx_sc_stats(struct net_device *dev, struct macsec_tx_sc_stats *sum) { struct macsec_dev *macsec = macsec_priv(dev); int cpu; /* If h/w offloading is available, propagate to the device */ if (macsec_is_offloaded(macsec)) { const struct macsec_ops *ops; struct macsec_context ctx; ops = macsec_get_ops(macsec, &ctx); if (ops) { ctx.stats.tx_sc_stats = sum; ctx.secy = &macsec_priv(dev)->secy; macsec_offload(ops->mdo_get_tx_sc_stats, &ctx); } return; } for_each_possible_cpu(cpu) { const struct pcpu_tx_sc_stats *stats; struct macsec_tx_sc_stats tmp; unsigned int start; stats = per_cpu_ptr(macsec_priv(dev)->secy.tx_sc.stats, cpu); do { start = u64_stats_fetch_begin_irq(&stats->syncp); memcpy(&tmp, &stats->stats, sizeof(tmp)); } while (u64_stats_fetch_retry_irq(&stats->syncp, start)); sum->OutPktsProtected += tmp.OutPktsProtected; sum->OutPktsEncrypted += tmp.OutPktsEncrypted; sum->OutOctetsProtected += tmp.OutOctetsProtected; sum->OutOctetsEncrypted += tmp.OutOctetsEncrypted; } } static int copy_tx_sc_stats(struct sk_buff *skb, struct macsec_tx_sc_stats *sum) { if (nla_put_u64_64bit(skb, MACSEC_TXSC_STATS_ATTR_OUT_PKTS_PROTECTED, sum->OutPktsProtected, MACSEC_TXSC_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, MACSEC_TXSC_STATS_ATTR_OUT_PKTS_ENCRYPTED, sum->OutPktsEncrypted, MACSEC_TXSC_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, MACSEC_TXSC_STATS_ATTR_OUT_OCTETS_PROTECTED, sum->OutOctetsProtected, MACSEC_TXSC_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, MACSEC_TXSC_STATS_ATTR_OUT_OCTETS_ENCRYPTED, sum->OutOctetsEncrypted, MACSEC_TXSC_STATS_ATTR_PAD)) return -EMSGSIZE; return 0; } static void get_secy_stats(struct net_device *dev, struct macsec_dev_stats *sum) { struct macsec_dev 
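/* --- Illustrative sketch (not part of the driver) ---
 * The per-CPU aggregation with a sequence-counter retry loop used by
 * get_rx_sc_stats()/get_tx_sc_stats() above. A writer may be mid-update, so
 * the reader snapshots the counters, rechecks the sequence number and
 * retries if it changed. The seqcount here is a simplified, single-threaded
 * stand-in for u64_stats_fetch_begin/retry.
 */
#include <stdint.h>
#include <stdio.h>

#define SKETCH_NCPU 4

struct sketch_pcpu_stats {
	unsigned int seq;	/* even: stable, odd: writer in progress */
	uint64_t in_pkts_ok;
	uint64_t in_pkts_late;
};

static void sketch_sum(const struct sketch_pcpu_stats *percpu,
		       uint64_t *ok, uint64_t *late)
{
	*ok = *late = 0;
	for (int cpu = 0; cpu < SKETCH_NCPU; cpu++) {
		const struct sketch_pcpu_stats *s = &percpu[cpu];
		uint64_t tmp_ok, tmp_late;
		unsigned int start;

		do {
			start = s->seq;			/* fetch_begin */
			tmp_ok = s->in_pkts_ok;
			tmp_late = s->in_pkts_late;
		} while (s->seq != start || (start & 1));	/* fetch_retry */

		*ok += tmp_ok;
		*late += tmp_late;
	}
}

int main(void)
{
	struct sketch_pcpu_stats percpu[SKETCH_NCPU] = {
		{ 0, 10, 1 }, { 0, 20, 0 }, { 0, 5, 2 }, { 0, 0, 0 },
	};
	uint64_t ok, late;

	sketch_sum(percpu, &ok, &late);
	printf("InPktsOK=%llu InPktsLate=%llu\n",
	       (unsigned long long)ok, (unsigned long long)late);
	return 0;
}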
*macsec = macsec_priv(dev); int cpu; /* If h/w offloading is available, propagate to the device */ if (macsec_is_offloaded(macsec)) { const struct macsec_ops *ops; struct macsec_context ctx; ops = macsec_get_ops(macsec, &ctx); if (ops) { ctx.stats.dev_stats = sum; ctx.secy = &macsec_priv(dev)->secy; macsec_offload(ops->mdo_get_dev_stats, &ctx); } return; } for_each_possible_cpu(cpu) { const struct pcpu_secy_stats *stats; struct macsec_dev_stats tmp; unsigned int start; stats = per_cpu_ptr(macsec_priv(dev)->stats, cpu); do { start = u64_stats_fetch_begin_irq(&stats->syncp); memcpy(&tmp, &stats->stats, sizeof(tmp)); } while (u64_stats_fetch_retry_irq(&stats->syncp, start)); sum->OutPktsUntagged += tmp.OutPktsUntagged; sum->InPktsUntagged += tmp.InPktsUntagged; sum->OutPktsTooLong += tmp.OutPktsTooLong; sum->InPktsNoTag += tmp.InPktsNoTag; sum->InPktsBadTag += tmp.InPktsBadTag; sum->InPktsUnknownSCI += tmp.InPktsUnknownSCI; sum->InPktsNoSCI += tmp.InPktsNoSCI; sum->InPktsOverrun += tmp.InPktsOverrun; } } static int copy_secy_stats(struct sk_buff *skb, struct macsec_dev_stats *sum) { if (nla_put_u64_64bit(skb, MACSEC_SECY_STATS_ATTR_OUT_PKTS_UNTAGGED, sum->OutPktsUntagged, MACSEC_SECY_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, MACSEC_SECY_STATS_ATTR_IN_PKTS_UNTAGGED, sum->InPktsUntagged, MACSEC_SECY_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, MACSEC_SECY_STATS_ATTR_OUT_PKTS_TOO_LONG, sum->OutPktsTooLong, MACSEC_SECY_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, MACSEC_SECY_STATS_ATTR_IN_PKTS_NO_TAG, sum->InPktsNoTag, MACSEC_SECY_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, MACSEC_SECY_STATS_ATTR_IN_PKTS_BAD_TAG, sum->InPktsBadTag, MACSEC_SECY_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, MACSEC_SECY_STATS_ATTR_IN_PKTS_UNKNOWN_SCI, sum->InPktsUnknownSCI, MACSEC_SECY_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, MACSEC_SECY_STATS_ATTR_IN_PKTS_NO_SCI, sum->InPktsNoSCI, MACSEC_SECY_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, MACSEC_SECY_STATS_ATTR_IN_PKTS_OVERRUN, sum->InPktsOverrun, MACSEC_SECY_STATS_ATTR_PAD)) return -EMSGSIZE; return 0; } static int nla_put_secy(struct macsec_secy *secy, struct sk_buff *skb) { struct macsec_tx_sc *tx_sc = &secy->tx_sc; struct nlattr *secy_nest = nla_nest_start_noflag(skb, MACSEC_ATTR_SECY); u64 csid; if (!secy_nest) return 1; switch (secy->key_len) { case MACSEC_GCM_AES_128_SAK_LEN: csid = secy->xpn ? MACSEC_CIPHER_ID_GCM_AES_XPN_128 : MACSEC_DEFAULT_CIPHER_ID; break; case MACSEC_GCM_AES_256_SAK_LEN: csid = secy->xpn ? 
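/* --- Illustrative sketch (not part of the driver) ---
 * The cipher suite that nla_put_secy() above reports is derived purely from
 * the SAK length and the XPN flag: 16-byte keys map to GCM-AES-128 (or
 * GCM-AES-XPN-128), 32-byte keys to the 256-bit variants. The driver returns
 * the numeric IDs from the UAPI header (using MACSEC_DEFAULT_CIPHER_ID for
 * the plain 128-bit case, as seen above); this sketch only returns names.
 */
#include <stdbool.h>
#include <stdio.h>

static const char *sketch_cipher_name(unsigned int sak_len, bool xpn)
{
	switch (sak_len) {
	case 16:
		return xpn ? "GCM-AES-XPN-128" : "GCM-AES-128";
	case 32:
		return xpn ? "GCM-AES-XPN-256" : "GCM-AES-256";
	default:
		return "unsupported key length";
	}
}

int main(void)
{
	printf("16-byte SAK, xpn off: %s\n", sketch_cipher_name(16, false));
	printf("32-byte SAK, xpn on:  %s\n", sketch_cipher_name(32, true));
	return 0;
}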
MACSEC_CIPHER_ID_GCM_AES_XPN_256 : MACSEC_CIPHER_ID_GCM_AES_256; break; default: goto cancel; } if (nla_put_sci(skb, MACSEC_SECY_ATTR_SCI, secy->sci, MACSEC_SECY_ATTR_PAD) || nla_put_u64_64bit(skb, MACSEC_SECY_ATTR_CIPHER_SUITE, csid, MACSEC_SECY_ATTR_PAD) || nla_put_u8(skb, MACSEC_SECY_ATTR_ICV_LEN, secy->icv_len) || nla_put_u8(skb, MACSEC_SECY_ATTR_OPER, secy->operational) || nla_put_u8(skb, MACSEC_SECY_ATTR_PROTECT, secy->protect_frames) || nla_put_u8(skb, MACSEC_SECY_ATTR_REPLAY, secy->replay_protect) || nla_put_u8(skb, MACSEC_SECY_ATTR_VALIDATE, secy->validate_frames) || nla_put_u8(skb, MACSEC_SECY_ATTR_ENCRYPT, tx_sc->encrypt) || nla_put_u8(skb, MACSEC_SECY_ATTR_INC_SCI, tx_sc->send_sci) || nla_put_u8(skb, MACSEC_SECY_ATTR_ES, tx_sc->end_station) || nla_put_u8(skb, MACSEC_SECY_ATTR_SCB, tx_sc->scb) || nla_put_u8(skb, MACSEC_SECY_ATTR_ENCODING_SA, tx_sc->encoding_sa)) goto cancel; if (secy->replay_protect) { if (nla_put_u32(skb, MACSEC_SECY_ATTR_WINDOW, secy->replay_window)) goto cancel; } nla_nest_end(skb, secy_nest); return 0; cancel: nla_nest_cancel(skb, secy_nest); return 1; } static noinline_for_stack int dump_secy(struct macsec_secy *secy, struct net_device *dev, struct sk_buff *skb, struct netlink_callback *cb) { struct macsec_tx_sc_stats tx_sc_stats = {0, }; struct macsec_tx_sa_stats tx_sa_stats = {0, }; struct macsec_rx_sc_stats rx_sc_stats = {0, }; struct macsec_rx_sa_stats rx_sa_stats = {0, }; struct macsec_dev *macsec = netdev_priv(dev); struct macsec_dev_stats dev_stats = {0, }; struct macsec_tx_sc *tx_sc = &secy->tx_sc; struct nlattr *txsa_list, *rxsc_list; struct macsec_rx_sc *rx_sc; struct nlattr *attr; void *hdr; int i, j; hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &macsec_fam, NLM_F_MULTI, MACSEC_CMD_GET_TXSC); if (!hdr) return -EMSGSIZE; genl_dump_check_consistent(cb, hdr); if (nla_put_u32(skb, MACSEC_ATTR_IFINDEX, dev->ifindex)) goto nla_put_failure; attr = nla_nest_start_noflag(skb, MACSEC_ATTR_OFFLOAD); if (!attr) goto nla_put_failure; if (nla_put_u8(skb, MACSEC_OFFLOAD_ATTR_TYPE, macsec->offload)) goto nla_put_failure; nla_nest_end(skb, attr); if (nla_put_secy(secy, skb)) goto nla_put_failure; attr = nla_nest_start_noflag(skb, MACSEC_ATTR_TXSC_STATS); if (!attr) goto nla_put_failure; get_tx_sc_stats(dev, &tx_sc_stats); if (copy_tx_sc_stats(skb, &tx_sc_stats)) { nla_nest_cancel(skb, attr); goto nla_put_failure; } nla_nest_end(skb, attr); attr = nla_nest_start_noflag(skb, MACSEC_ATTR_SECY_STATS); if (!attr) goto nla_put_failure; get_secy_stats(dev, &dev_stats); if (copy_secy_stats(skb, &dev_stats)) { nla_nest_cancel(skb, attr); goto nla_put_failure; } nla_nest_end(skb, attr); txsa_list = nla_nest_start_noflag(skb, MACSEC_ATTR_TXSA_LIST); if (!txsa_list) goto nla_put_failure; for (i = 0, j = 1; i < MACSEC_NUM_AN; i++) { struct macsec_tx_sa *tx_sa = rtnl_dereference(tx_sc->sa[i]); struct nlattr *txsa_nest; u64 pn; int pn_len; if (!tx_sa) continue; txsa_nest = nla_nest_start_noflag(skb, j++); if (!txsa_nest) { nla_nest_cancel(skb, txsa_list); goto nla_put_failure; } attr = nla_nest_start_noflag(skb, MACSEC_SA_ATTR_STATS); if (!attr) { nla_nest_cancel(skb, txsa_nest); nla_nest_cancel(skb, txsa_list); goto nla_put_failure; } memset(&tx_sa_stats, 0, sizeof(tx_sa_stats)); get_tx_sa_stats(dev, i, tx_sa, &tx_sa_stats); if (copy_tx_sa_stats(skb, &tx_sa_stats)) { nla_nest_cancel(skb, attr); nla_nest_cancel(skb, txsa_nest); nla_nest_cancel(skb, txsa_list); goto nla_put_failure; } nla_nest_end(skb, attr); if (secy->xpn) { pn = tx_sa->next_pn; pn_len 
= MACSEC_XPN_PN_LEN; } else { pn = tx_sa->next_pn_halves.lower; pn_len = MACSEC_DEFAULT_PN_LEN; } if (nla_put_u8(skb, MACSEC_SA_ATTR_AN, i) || nla_put(skb, MACSEC_SA_ATTR_PN, pn_len, &pn) || nla_put(skb, MACSEC_SA_ATTR_KEYID, MACSEC_KEYID_LEN, tx_sa->key.id) || (secy->xpn && nla_put_ssci(skb, MACSEC_SA_ATTR_SSCI, tx_sa->ssci)) || nla_put_u8(skb, MACSEC_SA_ATTR_ACTIVE, tx_sa->active)) { nla_nest_cancel(skb, txsa_nest); nla_nest_cancel(skb, txsa_list); goto nla_put_failure; } nla_nest_end(skb, txsa_nest); } nla_nest_end(skb, txsa_list); rxsc_list = nla_nest_start_noflag(skb, MACSEC_ATTR_RXSC_LIST); if (!rxsc_list) goto nla_put_failure; j = 1; for_each_rxsc_rtnl(secy, rx_sc) { int k; struct nlattr *rxsa_list; struct nlattr *rxsc_nest = nla_nest_start_noflag(skb, j++); if (!rxsc_nest) { nla_nest_cancel(skb, rxsc_list); goto nla_put_failure; } if (nla_put_u8(skb, MACSEC_RXSC_ATTR_ACTIVE, rx_sc->active) || nla_put_sci(skb, MACSEC_RXSC_ATTR_SCI, rx_sc->sci, MACSEC_RXSC_ATTR_PAD)) { nla_nest_cancel(skb, rxsc_nest); nla_nest_cancel(skb, rxsc_list); goto nla_put_failure; } attr = nla_nest_start_noflag(skb, MACSEC_RXSC_ATTR_STATS); if (!attr) { nla_nest_cancel(skb, rxsc_nest); nla_nest_cancel(skb, rxsc_list); goto nla_put_failure; } memset(&rx_sc_stats, 0, sizeof(rx_sc_stats)); get_rx_sc_stats(dev, rx_sc, &rx_sc_stats); if (copy_rx_sc_stats(skb, &rx_sc_stats)) { nla_nest_cancel(skb, attr); nla_nest_cancel(skb, rxsc_nest); nla_nest_cancel(skb, rxsc_list); goto nla_put_failure; } nla_nest_end(skb, attr); rxsa_list = nla_nest_start_noflag(skb, MACSEC_RXSC_ATTR_SA_LIST); if (!rxsa_list) { nla_nest_cancel(skb, rxsc_nest); nla_nest_cancel(skb, rxsc_list); goto nla_put_failure; } for (i = 0, k = 1; i < MACSEC_NUM_AN; i++) { struct macsec_rx_sa *rx_sa = rtnl_dereference(rx_sc->sa[i]); struct nlattr *rxsa_nest; u64 pn; int pn_len; if (!rx_sa) continue; rxsa_nest = nla_nest_start_noflag(skb, k++); if (!rxsa_nest) { nla_nest_cancel(skb, rxsa_list); nla_nest_cancel(skb, rxsc_nest); nla_nest_cancel(skb, rxsc_list); goto nla_put_failure; } attr = nla_nest_start_noflag(skb, MACSEC_SA_ATTR_STATS); if (!attr) { nla_nest_cancel(skb, rxsa_list); nla_nest_cancel(skb, rxsc_nest); nla_nest_cancel(skb, rxsc_list); goto nla_put_failure; } memset(&rx_sa_stats, 0, sizeof(rx_sa_stats)); get_rx_sa_stats(dev, rx_sc, i, rx_sa, &rx_sa_stats); if (copy_rx_sa_stats(skb, &rx_sa_stats)) { nla_nest_cancel(skb, attr); nla_nest_cancel(skb, rxsa_list); nla_nest_cancel(skb, rxsc_nest); nla_nest_cancel(skb, rxsc_list); goto nla_put_failure; } nla_nest_end(skb, attr); if (secy->xpn) { pn = rx_sa->next_pn; pn_len = MACSEC_XPN_PN_LEN; } else { pn = rx_sa->next_pn_halves.lower; pn_len = MACSEC_DEFAULT_PN_LEN; } if (nla_put_u8(skb, MACSEC_SA_ATTR_AN, i) || nla_put(skb, MACSEC_SA_ATTR_PN, pn_len, &pn) || nla_put(skb, MACSEC_SA_ATTR_KEYID, MACSEC_KEYID_LEN, rx_sa->key.id) || (secy->xpn && nla_put_ssci(skb, MACSEC_SA_ATTR_SSCI, rx_sa->ssci)) || nla_put_u8(skb, MACSEC_SA_ATTR_ACTIVE, rx_sa->active)) { nla_nest_cancel(skb, rxsa_nest); nla_nest_cancel(skb, rxsc_nest); nla_nest_cancel(skb, rxsc_list); goto nla_put_failure; } nla_nest_end(skb, rxsa_nest); } nla_nest_end(skb, rxsa_list); nla_nest_end(skb, rxsc_nest); } nla_nest_end(skb, rxsc_list); genlmsg_end(skb, hdr); return 0; nla_put_failure: genlmsg_cancel(skb, hdr); return -EMSGSIZE; } static int macsec_generation = 1; /* protected by RTNL */ static int macsec_dump_txsc(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); struct net_device *dev; int dev_idx, d; 
dev_idx = cb->args[0]; d = 0; rtnl_lock(); cb->seq = macsec_generation; for_each_netdev(net, dev) { struct macsec_secy *secy; if (d < dev_idx) goto next; if (!netif_is_macsec(dev)) goto next; secy = &macsec_priv(dev)->secy; if (dump_secy(secy, dev, skb, cb) < 0) goto done; next: d++; } done: rtnl_unlock(); cb->args[0] = d; return skb->len; } static const struct genl_small_ops macsec_genl_ops[] = { { .cmd = MACSEC_CMD_GET_TXSC, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .dumpit = macsec_dump_txsc, }, { .cmd = MACSEC_CMD_ADD_RXSC, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = macsec_add_rxsc, .flags = GENL_ADMIN_PERM, }, { .cmd = MACSEC_CMD_DEL_RXSC, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = macsec_del_rxsc, .flags = GENL_ADMIN_PERM, }, { .cmd = MACSEC_CMD_UPD_RXSC, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = macsec_upd_rxsc, .flags = GENL_ADMIN_PERM, }, { .cmd = MACSEC_CMD_ADD_TXSA, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = macsec_add_txsa, .flags = GENL_ADMIN_PERM, }, { .cmd = MACSEC_CMD_DEL_TXSA, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = macsec_del_txsa, .flags = GENL_ADMIN_PERM, }, { .cmd = MACSEC_CMD_UPD_TXSA, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = macsec_upd_txsa, .flags = GENL_ADMIN_PERM, }, { .cmd = MACSEC_CMD_ADD_RXSA, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = macsec_add_rxsa, .flags = GENL_ADMIN_PERM, }, { .cmd = MACSEC_CMD_DEL_RXSA, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = macsec_del_rxsa, .flags = GENL_ADMIN_PERM, }, { .cmd = MACSEC_CMD_UPD_RXSA, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = macsec_upd_rxsa, .flags = GENL_ADMIN_PERM, }, { .cmd = MACSEC_CMD_UPD_OFFLOAD, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = macsec_upd_offload, .flags = GENL_ADMIN_PERM, }, }; static struct genl_family macsec_fam __ro_after_init = { .name = MACSEC_GENL_NAME, .hdrsize = 0, .version = MACSEC_GENL_VERSION, .maxattr = MACSEC_ATTR_MAX, .policy = macsec_genl_policy, .netnsok = true, .module = THIS_MODULE, .small_ops = macsec_genl_ops, .n_small_ops = ARRAY_SIZE(macsec_genl_ops), }; static netdev_tx_t macsec_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct macsec_dev *macsec = netdev_priv(dev); struct macsec_secy *secy = &macsec->secy; struct pcpu_secy_stats *secy_stats; int ret, len; if (macsec_is_offloaded(netdev_priv(dev))) { skb->dev = macsec->real_dev; return dev_queue_xmit(skb); } /* 10.5 */ if (!secy->protect_frames) { secy_stats = this_cpu_ptr(macsec->stats); u64_stats_update_begin(&secy_stats->syncp); secy_stats->stats.OutPktsUntagged++; u64_stats_update_end(&secy_stats->syncp); skb->dev = macsec->real_dev; len = skb->len; ret = dev_queue_xmit(skb); count_tx(dev, ret, len); return ret; } if (!secy->operational) { kfree_skb(skb); DEV_STATS_INC(dev, tx_dropped); return NETDEV_TX_OK; } len = skb->len; skb = macsec_encrypt(skb, dev); if (IS_ERR(skb)) { if (PTR_ERR(skb) != -EINPROGRESS) DEV_STATS_INC(dev, tx_dropped); return NETDEV_TX_OK; } macsec_count_tx(skb, &macsec->secy.tx_sc, macsec_skb_cb(skb)->tx_sa); macsec_encrypt_finish(skb, dev); ret = dev_queue_xmit(skb); count_tx(dev, ret, len); return ret; } #define MACSEC_FEATURES \ (NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST) static int macsec_dev_init(struct net_device *dev) { struct macsec_dev *macsec = 
macsec_priv(dev); struct net_device *real_dev = macsec->real_dev; int err; dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); if (!dev->tstats) return -ENOMEM; err = gro_cells_init(&macsec->gro_cells, dev); if (err) { free_percpu(dev->tstats); return err; } dev->features = real_dev->features & MACSEC_FEATURES; dev->features |= NETIF_F_LLTX | NETIF_F_GSO_SOFTWARE; dev->needed_headroom = real_dev->needed_headroom + MACSEC_NEEDED_HEADROOM; dev->needed_tailroom = real_dev->needed_tailroom + MACSEC_NEEDED_TAILROOM; if (is_zero_ether_addr(dev->dev_addr)) eth_hw_addr_inherit(dev, real_dev); if (is_zero_ether_addr(dev->broadcast)) memcpy(dev->broadcast, real_dev->broadcast, dev->addr_len); return 0; } static void macsec_dev_uninit(struct net_device *dev) { struct macsec_dev *macsec = macsec_priv(dev); gro_cells_destroy(&macsec->gro_cells); free_percpu(dev->tstats); } static netdev_features_t macsec_fix_features(struct net_device *dev, netdev_features_t features) { struct macsec_dev *macsec = macsec_priv(dev); struct net_device *real_dev = macsec->real_dev; features &= (real_dev->features & MACSEC_FEATURES) | NETIF_F_GSO_SOFTWARE | NETIF_F_SOFT_FEATURES; features |= NETIF_F_LLTX; return features; } static int macsec_dev_open(struct net_device *dev) { struct macsec_dev *macsec = macsec_priv(dev); struct net_device *real_dev = macsec->real_dev; int err; err = dev_uc_add(real_dev, dev->dev_addr); if (err < 0) return err; if (dev->flags & IFF_ALLMULTI) { err = dev_set_allmulti(real_dev, 1); if (err < 0) goto del_unicast; } if (dev->flags & IFF_PROMISC) { err = dev_set_promiscuity(real_dev, 1); if (err < 0) goto clear_allmulti; } /* If h/w offloading is available, propagate to the device */ if (macsec_is_offloaded(macsec)) { const struct macsec_ops *ops; struct macsec_context ctx; ops = macsec_get_ops(netdev_priv(dev), &ctx); if (!ops) { err = -EOPNOTSUPP; goto clear_allmulti; } ctx.secy = &macsec->secy; err = macsec_offload(ops->mdo_dev_open, &ctx); if (err) goto clear_allmulti; } if (netif_carrier_ok(real_dev)) netif_carrier_on(dev); return 0; clear_allmulti: if (dev->flags & IFF_ALLMULTI) dev_set_allmulti(real_dev, -1); del_unicast: dev_uc_del(real_dev, dev->dev_addr); netif_carrier_off(dev); return err; } static int macsec_dev_stop(struct net_device *dev) { struct macsec_dev *macsec = macsec_priv(dev); struct net_device *real_dev = macsec->real_dev; netif_carrier_off(dev); /* If h/w offloading is available, propagate to the device */ if (macsec_is_offloaded(macsec)) { const struct macsec_ops *ops; struct macsec_context ctx; ops = macsec_get_ops(macsec, &ctx); if (ops) { ctx.secy = &macsec->secy; macsec_offload(ops->mdo_dev_stop, &ctx); } } dev_mc_unsync(real_dev, dev); dev_uc_unsync(real_dev, dev); if (dev->flags & IFF_ALLMULTI) dev_set_allmulti(real_dev, -1); if (dev->flags & IFF_PROMISC) dev_set_promiscuity(real_dev, -1); dev_uc_del(real_dev, dev->dev_addr); return 0; } static void macsec_dev_change_rx_flags(struct net_device *dev, int change) { struct net_device *real_dev = macsec_priv(dev)->real_dev; if (!(dev->flags & IFF_UP)) return; if (change & IFF_ALLMULTI) dev_set_allmulti(real_dev, dev->flags & IFF_ALLMULTI ? 1 : -1); if (change & IFF_PROMISC) dev_set_promiscuity(real_dev, dev->flags & IFF_PROMISC ? 
1 : -1); } static void macsec_dev_set_rx_mode(struct net_device *dev) { struct net_device *real_dev = macsec_priv(dev)->real_dev; dev_mc_sync(real_dev, dev); dev_uc_sync(real_dev, dev); } static int macsec_set_mac_address(struct net_device *dev, void *p) { struct macsec_dev *macsec = macsec_priv(dev); struct net_device *real_dev = macsec->real_dev; struct sockaddr *addr = p; int err; if (!is_valid_ether_addr(addr->sa_data)) return -EADDRNOTAVAIL; if (!(dev->flags & IFF_UP)) goto out; err = dev_uc_add(real_dev, addr->sa_data); if (err < 0) return err; dev_uc_del(real_dev, dev->dev_addr); out: eth_hw_addr_set(dev, addr->sa_data); /* If h/w offloading is available, propagate to the device */ if (macsec_is_offloaded(macsec)) { const struct macsec_ops *ops; struct macsec_context ctx; ops = macsec_get_ops(macsec, &ctx); if (ops) { ctx.secy = &macsec->secy; macsec_offload(ops->mdo_upd_secy, &ctx); } } return 0; } static int macsec_change_mtu(struct net_device *dev, int new_mtu) { struct macsec_dev *macsec = macsec_priv(dev); unsigned int extra = macsec->secy.icv_len + macsec_extra_len(true); if (macsec->real_dev->mtu - extra < new_mtu) return -ERANGE; dev->mtu = new_mtu; return 0; } static void macsec_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *s) { if (!dev->tstats) return; dev_fetch_sw_netstats(s, dev->tstats); s->rx_dropped = DEV_STATS_READ(dev, rx_dropped); s->tx_dropped = DEV_STATS_READ(dev, tx_dropped); s->rx_errors = DEV_STATS_READ(dev, rx_errors); } static int macsec_get_iflink(const struct net_device *dev) { return macsec_priv(dev)->real_dev->ifindex; } static const struct net_device_ops macsec_netdev_ops = { .ndo_init = macsec_dev_init, .ndo_uninit = macsec_dev_uninit, .ndo_open = macsec_dev_open, .ndo_stop = macsec_dev_stop, .ndo_fix_features = macsec_fix_features, .ndo_change_mtu = macsec_change_mtu, .ndo_set_rx_mode = macsec_dev_set_rx_mode, .ndo_change_rx_flags = macsec_dev_change_rx_flags, .ndo_set_mac_address = macsec_set_mac_address, .ndo_start_xmit = macsec_start_xmit, .ndo_get_stats64 = macsec_get_stats64, .ndo_get_iflink = macsec_get_iflink, }; static const struct device_type macsec_type = { .name = "macsec", }; static const struct nla_policy macsec_rtnl_policy[IFLA_MACSEC_MAX + 1] = { [IFLA_MACSEC_SCI] = { .type = NLA_U64 }, [IFLA_MACSEC_PORT] = { .type = NLA_U16 }, [IFLA_MACSEC_ICV_LEN] = { .type = NLA_U8 }, [IFLA_MACSEC_CIPHER_SUITE] = { .type = NLA_U64 }, [IFLA_MACSEC_WINDOW] = { .type = NLA_U32 }, [IFLA_MACSEC_ENCODING_SA] = { .type = NLA_U8 }, [IFLA_MACSEC_ENCRYPT] = { .type = NLA_U8 }, [IFLA_MACSEC_PROTECT] = { .type = NLA_U8 }, [IFLA_MACSEC_INC_SCI] = { .type = NLA_U8 }, [IFLA_MACSEC_ES] = { .type = NLA_U8 }, [IFLA_MACSEC_SCB] = { .type = NLA_U8 }, [IFLA_MACSEC_REPLAY_PROTECT] = { .type = NLA_U8 }, [IFLA_MACSEC_VALIDATION] = { .type = NLA_U8 }, [IFLA_MACSEC_OFFLOAD] = { .type = NLA_U8 }, }; static void macsec_free_netdev(struct net_device *dev) { struct macsec_dev *macsec = macsec_priv(dev); free_percpu(macsec->stats); free_percpu(macsec->secy.tx_sc.stats); } static void macsec_setup(struct net_device *dev) { ether_setup(dev); dev->min_mtu = 0; dev->max_mtu = ETH_MAX_MTU; dev->priv_flags |= IFF_NO_QUEUE; dev->netdev_ops = &macsec_netdev_ops; dev->needs_free_netdev = true; dev->priv_destructor = macsec_free_netdev; SET_NETDEV_DEVTYPE(dev, &macsec_type); eth_zero_addr(dev->broadcast); } static int macsec_changelink_common(struct net_device *dev, struct nlattr *data[]) { struct macsec_secy *secy; struct macsec_tx_sc *tx_sc; secy = 
&macsec_priv(dev)->secy; tx_sc = &secy->tx_sc; if (data[IFLA_MACSEC_ENCODING_SA]) { struct macsec_tx_sa *tx_sa; tx_sc->encoding_sa = nla_get_u8(data[IFLA_MACSEC_ENCODING_SA]); tx_sa = rtnl_dereference(tx_sc->sa[tx_sc->encoding_sa]); secy->operational = tx_sa && tx_sa->active; } if (data[IFLA_MACSEC_ENCRYPT]) tx_sc->encrypt = !!nla_get_u8(data[IFLA_MACSEC_ENCRYPT]); if (data[IFLA_MACSEC_PROTECT]) secy->protect_frames = !!nla_get_u8(data[IFLA_MACSEC_PROTECT]); if (data[IFLA_MACSEC_INC_SCI]) tx_sc->send_sci = !!nla_get_u8(data[IFLA_MACSEC_INC_SCI]); if (data[IFLA_MACSEC_ES]) tx_sc->end_station = !!nla_get_u8(data[IFLA_MACSEC_ES]); if (data[IFLA_MACSEC_SCB]) tx_sc->scb = !!nla_get_u8(data[IFLA_MACSEC_SCB]); if (data[IFLA_MACSEC_REPLAY_PROTECT]) secy->replay_protect = !!nla_get_u8(data[IFLA_MACSEC_REPLAY_PROTECT]); if (data[IFLA_MACSEC_VALIDATION]) secy->validate_frames = nla_get_u8(data[IFLA_MACSEC_VALIDATION]); if (data[IFLA_MACSEC_CIPHER_SUITE]) { switch (nla_get_u64(data[IFLA_MACSEC_CIPHER_SUITE])) { case MACSEC_CIPHER_ID_GCM_AES_128: case MACSEC_DEFAULT_CIPHER_ID: secy->key_len = MACSEC_GCM_AES_128_SAK_LEN; secy->xpn = false; break; case MACSEC_CIPHER_ID_GCM_AES_256: secy->key_len = MACSEC_GCM_AES_256_SAK_LEN; secy->xpn = false; break; case MACSEC_CIPHER_ID_GCM_AES_XPN_128: secy->key_len = MACSEC_GCM_AES_128_SAK_LEN; secy->xpn = true; break; case MACSEC_CIPHER_ID_GCM_AES_XPN_256: secy->key_len = MACSEC_GCM_AES_256_SAK_LEN; secy->xpn = true; break; default: return -EINVAL; } } if (data[IFLA_MACSEC_WINDOW]) { secy->replay_window = nla_get_u32(data[IFLA_MACSEC_WINDOW]); /* IEEE 802.1AEbw-2013 10.7.8 - maximum replay window * for XPN cipher suites */ if (secy->xpn && secy->replay_window > MACSEC_XPN_MAX_REPLAY_WINDOW) return -EINVAL; } return 0; } static int macsec_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct macsec_dev *macsec = macsec_priv(dev); struct macsec_tx_sc tx_sc; struct macsec_secy secy; int ret; if (!data) return 0; if (data[IFLA_MACSEC_CIPHER_SUITE] || data[IFLA_MACSEC_ICV_LEN] || data[IFLA_MACSEC_SCI] || data[IFLA_MACSEC_PORT]) return -EINVAL; /* Keep a copy of unmodified secy and tx_sc, in case the offload * propagation fails, to revert macsec_changelink_common. 
*/ memcpy(&secy, &macsec->secy, sizeof(secy)); memcpy(&tx_sc, &macsec->secy.tx_sc, sizeof(tx_sc)); ret = macsec_changelink_common(dev, data); if (ret) goto cleanup; /* If h/w offloading is available, propagate to the device */ if (macsec_is_offloaded(macsec)) { const struct macsec_ops *ops; struct macsec_context ctx; ops = macsec_get_ops(netdev_priv(dev), &ctx); if (!ops) { ret = -EOPNOTSUPP; goto cleanup; } ctx.secy = &macsec->secy; ret = macsec_offload(ops->mdo_upd_secy, &ctx); if (ret) goto cleanup; } return 0; cleanup: memcpy(&macsec->secy.tx_sc, &tx_sc, sizeof(tx_sc)); memcpy(&macsec->secy, &secy, sizeof(secy)); return ret; } static void macsec_del_dev(struct macsec_dev *macsec) { int i; while (macsec->secy.rx_sc) { struct macsec_rx_sc *rx_sc = rtnl_dereference(macsec->secy.rx_sc); rcu_assign_pointer(macsec->secy.rx_sc, rx_sc->next); free_rx_sc(rx_sc); } for (i = 0; i < MACSEC_NUM_AN; i++) { struct macsec_tx_sa *sa = rtnl_dereference(macsec->secy.tx_sc.sa[i]); if (sa) { RCU_INIT_POINTER(macsec->secy.tx_sc.sa[i], NULL); clear_tx_sa(sa); } } } static void macsec_common_dellink(struct net_device *dev, struct list_head *head) { struct macsec_dev *macsec = macsec_priv(dev); struct net_device *real_dev = macsec->real_dev; /* If h/w offloading is available, propagate to the device */ if (macsec_is_offloaded(macsec)) { const struct macsec_ops *ops; struct macsec_context ctx; ops = macsec_get_ops(netdev_priv(dev), &ctx); if (ops) { ctx.secy = &macsec->secy; macsec_offload(ops->mdo_del_secy, &ctx); } } unregister_netdevice_queue(dev, head); list_del_rcu(&macsec->secys); macsec_del_dev(macsec); netdev_upper_dev_unlink(real_dev, dev); macsec_generation++; } static void macsec_dellink(struct net_device *dev, struct list_head *head) { struct macsec_dev *macsec = macsec_priv(dev); struct net_device *real_dev = macsec->real_dev; struct macsec_rxh_data *rxd = macsec_data_rtnl(real_dev); macsec_common_dellink(dev, head); if (list_empty(&rxd->secys)) { netdev_rx_handler_unregister(real_dev); kfree(rxd); } } static int register_macsec_dev(struct net_device *real_dev, struct net_device *dev) { struct macsec_dev *macsec = macsec_priv(dev); struct macsec_rxh_data *rxd = macsec_data_rtnl(real_dev); if (!rxd) { int err; rxd = kmalloc(sizeof(*rxd), GFP_KERNEL); if (!rxd) return -ENOMEM; INIT_LIST_HEAD(&rxd->secys); err = netdev_rx_handler_register(real_dev, macsec_handle_frame, rxd); if (err < 0) { kfree(rxd); return err; } } list_add_tail_rcu(&macsec->secys, &rxd->secys); return 0; } static bool sci_exists(struct net_device *dev, sci_t sci) { struct macsec_rxh_data *rxd = macsec_data_rtnl(dev); struct macsec_dev *macsec; list_for_each_entry(macsec, &rxd->secys, secys) { if (macsec->secy.sci == sci) return true; } return false; } static sci_t dev_to_sci(struct net_device *dev, __be16 port) { return make_sci(dev->dev_addr, port); } static int macsec_add_dev(struct net_device *dev, sci_t sci, u8 icv_len) { struct macsec_dev *macsec = macsec_priv(dev); struct macsec_secy *secy = &macsec->secy; macsec->stats = netdev_alloc_pcpu_stats(struct pcpu_secy_stats); if (!macsec->stats) return -ENOMEM; secy->tx_sc.stats = netdev_alloc_pcpu_stats(struct pcpu_tx_sc_stats); if (!secy->tx_sc.stats) { free_percpu(macsec->stats); return -ENOMEM; } if (sci == MACSEC_UNDEF_SCI) sci = dev_to_sci(dev, MACSEC_PORT_ES); secy->netdev = dev; secy->operational = true; secy->key_len = DEFAULT_SAK_LEN; secy->icv_len = icv_len; secy->validate_frames = MACSEC_VALIDATE_DEFAULT; secy->protect_frames = true; secy->replay_protect = false; 
secy->xpn = DEFAULT_XPN; secy->sci = sci; secy->tx_sc.active = true; secy->tx_sc.encoding_sa = DEFAULT_ENCODING_SA; secy->tx_sc.encrypt = DEFAULT_ENCRYPT; secy->tx_sc.send_sci = DEFAULT_SEND_SCI; secy->tx_sc.end_station = false; secy->tx_sc.scb = false; return 0; } static struct lock_class_key macsec_netdev_addr_lock_key; static int macsec_newlink(struct net *net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct macsec_dev *macsec = macsec_priv(dev); rx_handler_func_t *rx_handler; u8 icv_len = DEFAULT_ICV_LEN; struct net_device *real_dev; int err, mtu; sci_t sci; if (!tb[IFLA_LINK]) return -EINVAL; real_dev = __dev_get_by_index(net, nla_get_u32(tb[IFLA_LINK])); if (!real_dev) return -ENODEV; if (real_dev->type != ARPHRD_ETHER) return -EINVAL; dev->priv_flags |= IFF_MACSEC; macsec->real_dev = real_dev; if (data && data[IFLA_MACSEC_OFFLOAD]) macsec->offload = nla_get_offload(data[IFLA_MACSEC_OFFLOAD]); else /* MACsec offloading is off by default */ macsec->offload = MACSEC_OFFLOAD_OFF; /* Check if the offloading mode is supported by the underlying layers */ if (macsec->offload != MACSEC_OFFLOAD_OFF && !macsec_check_offload(macsec->offload, macsec)) return -EOPNOTSUPP; /* send_sci must be set to true when transmit sci explicitly is set */ if ((data && data[IFLA_MACSEC_SCI]) && (data && data[IFLA_MACSEC_INC_SCI])) { u8 send_sci = !!nla_get_u8(data[IFLA_MACSEC_INC_SCI]); if (!send_sci) return -EINVAL; } if (data && data[IFLA_MACSEC_ICV_LEN]) icv_len = nla_get_u8(data[IFLA_MACSEC_ICV_LEN]); mtu = real_dev->mtu - icv_len - macsec_extra_len(true); if (mtu < 0) dev->mtu = 0; else dev->mtu = mtu; rx_handler = rtnl_dereference(real_dev->rx_handler); if (rx_handler && rx_handler != macsec_handle_frame) return -EBUSY; err = register_netdevice(dev); if (err < 0) return err; netdev_lockdep_set_classes(dev); lockdep_set_class(&dev->addr_list_lock, &macsec_netdev_addr_lock_key); err = netdev_upper_dev_link(real_dev, dev, extack); if (err < 0) goto unregister; /* need to be already registered so that ->init has run and * the MAC addr is set */ if (data && data[IFLA_MACSEC_SCI]) sci = nla_get_sci(data[IFLA_MACSEC_SCI]); else if (data && data[IFLA_MACSEC_PORT]) sci = dev_to_sci(dev, nla_get_be16(data[IFLA_MACSEC_PORT])); else sci = dev_to_sci(dev, MACSEC_PORT_ES); if (rx_handler && sci_exists(real_dev, sci)) { err = -EBUSY; goto unlink; } err = macsec_add_dev(dev, sci, icv_len); if (err) goto unlink; if (data) { err = macsec_changelink_common(dev, data); if (err) goto del_dev; } /* If h/w offloading is available, propagate to the device */ if (macsec_is_offloaded(macsec)) { const struct macsec_ops *ops; struct macsec_context ctx; ops = macsec_get_ops(macsec, &ctx); if (ops) { ctx.secy = &macsec->secy; err = macsec_offload(ops->mdo_add_secy, &ctx); if (err) goto del_dev; } } err = register_macsec_dev(real_dev, dev); if (err < 0) goto del_dev; netif_stacked_transfer_operstate(real_dev, dev); linkwatch_fire_event(dev); macsec_generation++; return 0; del_dev: macsec_del_dev(macsec); unlink: netdev_upper_dev_unlink(real_dev, dev); unregister: unregister_netdevice(dev); return err; } static int macsec_validate_attr(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { u64 csid = MACSEC_DEFAULT_CIPHER_ID; u8 icv_len = DEFAULT_ICV_LEN; int flag; bool es, scb, sci; if (!data) return 0; if (data[IFLA_MACSEC_CIPHER_SUITE]) csid = nla_get_u64(data[IFLA_MACSEC_CIPHER_SUITE]); if (data[IFLA_MACSEC_ICV_LEN]) { icv_len = 
nla_get_u8(data[IFLA_MACSEC_ICV_LEN]); if (icv_len != DEFAULT_ICV_LEN) { char dummy_key[DEFAULT_SAK_LEN] = { 0 }; struct crypto_aead *dummy_tfm; dummy_tfm = macsec_alloc_tfm(dummy_key, DEFAULT_SAK_LEN, icv_len); if (IS_ERR(dummy_tfm)) return PTR_ERR(dummy_tfm); crypto_free_aead(dummy_tfm); } } switch (csid) { case MACSEC_CIPHER_ID_GCM_AES_128: case MACSEC_CIPHER_ID_GCM_AES_256: case MACSEC_CIPHER_ID_GCM_AES_XPN_128: case MACSEC_CIPHER_ID_GCM_AES_XPN_256: case MACSEC_DEFAULT_CIPHER_ID: if (icv_len < MACSEC_MIN_ICV_LEN || icv_len > MACSEC_STD_ICV_LEN) return -EINVAL; break; default: return -EINVAL; } if (data[IFLA_MACSEC_ENCODING_SA]) { if (nla_get_u8(data[IFLA_MACSEC_ENCODING_SA]) >= MACSEC_NUM_AN) return -EINVAL; } for (flag = IFLA_MACSEC_ENCODING_SA + 1; flag < IFLA_MACSEC_VALIDATION; flag++) { if (data[flag]) { if (nla_get_u8(data[flag]) > 1) return -EINVAL; } } es = data[IFLA_MACSEC_ES] ? nla_get_u8(data[IFLA_MACSEC_ES]) : false; sci = data[IFLA_MACSEC_INC_SCI] ? nla_get_u8(data[IFLA_MACSEC_INC_SCI]) : false; scb = data[IFLA_MACSEC_SCB] ? nla_get_u8(data[IFLA_MACSEC_SCB]) : false; if ((sci && (scb || es)) || (scb && es)) return -EINVAL; if (data[IFLA_MACSEC_VALIDATION] && nla_get_u8(data[IFLA_MACSEC_VALIDATION]) > MACSEC_VALIDATE_MAX) return -EINVAL; if ((data[IFLA_MACSEC_REPLAY_PROTECT] && nla_get_u8(data[IFLA_MACSEC_REPLAY_PROTECT])) && !data[IFLA_MACSEC_WINDOW]) return -EINVAL; return 0; } static struct net *macsec_get_link_net(const struct net_device *dev) { return dev_net(macsec_priv(dev)->real_dev); } static size_t macsec_get_size(const struct net_device *dev) { return nla_total_size_64bit(8) + /* IFLA_MACSEC_SCI */ nla_total_size(1) + /* IFLA_MACSEC_ICV_LEN */ nla_total_size_64bit(8) + /* IFLA_MACSEC_CIPHER_SUITE */ nla_total_size(4) + /* IFLA_MACSEC_WINDOW */ nla_total_size(1) + /* IFLA_MACSEC_ENCODING_SA */ nla_total_size(1) + /* IFLA_MACSEC_ENCRYPT */ nla_total_size(1) + /* IFLA_MACSEC_PROTECT */ nla_total_size(1) + /* IFLA_MACSEC_INC_SCI */ nla_total_size(1) + /* IFLA_MACSEC_ES */ nla_total_size(1) + /* IFLA_MACSEC_SCB */ nla_total_size(1) + /* IFLA_MACSEC_REPLAY_PROTECT */ nla_total_size(1) + /* IFLA_MACSEC_VALIDATION */ 0; } static int macsec_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct macsec_secy *secy = &macsec_priv(dev)->secy; struct macsec_tx_sc *tx_sc = &secy->tx_sc; u64 csid; switch (secy->key_len) { case MACSEC_GCM_AES_128_SAK_LEN: csid = secy->xpn ? MACSEC_CIPHER_ID_GCM_AES_XPN_128 : MACSEC_DEFAULT_CIPHER_ID; break; case MACSEC_GCM_AES_256_SAK_LEN: csid = secy->xpn ? 
MACSEC_CIPHER_ID_GCM_AES_XPN_256 : MACSEC_CIPHER_ID_GCM_AES_256; break; default: goto nla_put_failure; } if (nla_put_sci(skb, IFLA_MACSEC_SCI, secy->sci, IFLA_MACSEC_PAD) || nla_put_u8(skb, IFLA_MACSEC_ICV_LEN, secy->icv_len) || nla_put_u64_64bit(skb, IFLA_MACSEC_CIPHER_SUITE, csid, IFLA_MACSEC_PAD) || nla_put_u8(skb, IFLA_MACSEC_ENCODING_SA, tx_sc->encoding_sa) || nla_put_u8(skb, IFLA_MACSEC_ENCRYPT, tx_sc->encrypt) || nla_put_u8(skb, IFLA_MACSEC_PROTECT, secy->protect_frames) || nla_put_u8(skb, IFLA_MACSEC_INC_SCI, tx_sc->send_sci) || nla_put_u8(skb, IFLA_MACSEC_ES, tx_sc->end_station) || nla_put_u8(skb, IFLA_MACSEC_SCB, tx_sc->scb) || nla_put_u8(skb, IFLA_MACSEC_REPLAY_PROTECT, secy->replay_protect) || nla_put_u8(skb, IFLA_MACSEC_VALIDATION, secy->validate_frames) || 0) goto nla_put_failure; if (secy->replay_protect) { if (nla_put_u32(skb, IFLA_MACSEC_WINDOW, secy->replay_window)) goto nla_put_failure; } return 0; nla_put_failure: return -EMSGSIZE; } static struct rtnl_link_ops macsec_link_ops __read_mostly = { .kind = "macsec", .priv_size = sizeof(struct macsec_dev), .maxtype = IFLA_MACSEC_MAX, .policy = macsec_rtnl_policy, .setup = macsec_setup, .validate = macsec_validate_attr, .newlink = macsec_newlink, .changelink = macsec_changelink, .dellink = macsec_dellink, .get_size = macsec_get_size, .fill_info = macsec_fill_info, .get_link_net = macsec_get_link_net, }; static bool is_macsec_master(struct net_device *dev) { return rcu_access_pointer(dev->rx_handler) == macsec_handle_frame; } static int macsec_notify(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *real_dev = netdev_notifier_info_to_dev(ptr); LIST_HEAD(head); if (!is_macsec_master(real_dev)) return NOTIFY_DONE; switch (event) { case NETDEV_DOWN: case NETDEV_UP: case NETDEV_CHANGE: { struct macsec_dev *m, *n; struct macsec_rxh_data *rxd; rxd = macsec_data_rtnl(real_dev); list_for_each_entry_safe(m, n, &rxd->secys, secys) { struct net_device *dev = m->secy.netdev; netif_stacked_transfer_operstate(real_dev, dev); } break; } case NETDEV_UNREGISTER: { struct macsec_dev *m, *n; struct macsec_rxh_data *rxd; rxd = macsec_data_rtnl(real_dev); list_for_each_entry_safe(m, n, &rxd->secys, secys) { macsec_common_dellink(m->secy.netdev, &head); } netdev_rx_handler_unregister(real_dev); kfree(rxd); unregister_netdevice_many(&head); break; } case NETDEV_CHANGEMTU: { struct macsec_dev *m; struct macsec_rxh_data *rxd; rxd = macsec_data_rtnl(real_dev); list_for_each_entry(m, &rxd->secys, secys) { struct net_device *dev = m->secy.netdev; unsigned int mtu = real_dev->mtu - (m->secy.icv_len + macsec_extra_len(true)); if (dev->mtu > mtu) dev_set_mtu(dev, mtu); } } } return NOTIFY_OK; } static struct notifier_block macsec_notifier = { .notifier_call = macsec_notify, }; static int __init macsec_init(void) { int err; pr_info("MACsec IEEE 802.1AE\n"); err = register_netdevice_notifier(&macsec_notifier); if (err) return err; err = rtnl_link_register(&macsec_link_ops); if (err) goto notifier; err = genl_register_family(&macsec_fam); if (err) goto rtnl; return 0; rtnl: rtnl_link_unregister(&macsec_link_ops); notifier: unregister_netdevice_notifier(&macsec_notifier); return err; } static void __exit macsec_exit(void) { genl_unregister_family(&macsec_fam); rtnl_link_unregister(&macsec_link_ops); unregister_netdevice_notifier(&macsec_notifier); rcu_barrier(); } module_init(macsec_init); module_exit(macsec_exit); MODULE_ALIAS_RTNL_LINK("macsec"); MODULE_ALIAS_GENL_FAMILY("macsec"); MODULE_DESCRIPTION("MACsec IEEE 802.1AE"); 
MODULE_LICENSE("GPL v2");
/* SPDX-License-Identifier: GPL-2.0 */ /* * Multipath TCP * * Copyright (c) 2017 - 2019, Intel Corporation. */ #ifndef __NET_MPTCP_H #define __NET_MPTCP_H #include <linux/skbuff.h> #include <linux/tcp.h> #include <linux/types.h> struct seq_file; /* MPTCP sk_buff extension data */ struct mptcp_ext { union { u64 data_ack; u32 data_ack32; }; u64 data_seq; u32 subflow_seq; u16 data_len; __sum16 csum; u8 use_map:1, dsn64:1, data_fin:1, use_ack:1, ack64:1, mpc_map:1, frozen:1, reset_transient:1; u8 reset_reason:4, csum_reqd:1; }; #define MPTCP_RM_IDS_MAX 8 struct mptcp_rm_list { u8 ids[MPTCP_RM_IDS_MAX]; u8 nr; }; struct mptcp_addr_info { u8 id; sa_family_t family; __be16 port; union { struct in_addr addr; #if IS_ENABLED(CONFIG_MPTCP_IPV6) struct in6_addr addr6; #endif }; }; struct mptcp_out_options { #if IS_ENABLED(CONFIG_MPTCP) u16 suboptions; struct mptcp_rm_list rm_list; u8 join_id; u8 backup; u8 reset_reason:4, reset_transient:1, csum_reqd:1, allow_join_id0:1; union { struct { u64 sndr_key; u64 rcvr_key; u64 data_seq; u32 subflow_seq; u16 data_len; __sum16 csum; }; struct { struct mptcp_addr_info addr; u64 ahmac; }; struct { struct mptcp_ext ext_copy; u64 fail_seq; }; struct { u32 nonce; u32 token; u64 thmac; u8 hmac[20]; }; }; #endif }; #ifdef CONFIG_MPTCP void mptcp_init(void); static inline bool sk_is_mptcp(const struct sock *sk) { return tcp_sk(sk)->is_mptcp; } static inline bool rsk_is_mptcp(const struct request_sock *req) { return tcp_rsk(req)->is_mptcp; } static inline bool rsk_drop_req(const struct request_sock *req) { return tcp_rsk(req)->is_mptcp && tcp_rsk(req)->drop_req; } void mptcp_space(const struct sock *ssk, int *space, int *full_space); bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb, unsigned int *size, struct mptcp_out_options *opts); bool mptcp_synack_options(const struct request_sock *req, unsigned int *size, struct mptcp_out_options *opts); bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, unsigned int *size, unsigned int remaining, struct mptcp_out_options *opts); bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb); void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp, struct mptcp_out_options *opts); /* move the skb extension ownership, with the assumption that 'to' is * newly allocated */ static inline void mptcp_skb_ext_move(struct sk_buff *to, struct sk_buff *from) { if (!skb_ext_exist(from, SKB_EXT_MPTCP)) return; if
(WARN_ON_ONCE(to->active_extensions)) skb_ext_put(to); to->active_extensions = from->active_extensions; to->extensions = from->extensions; from->active_extensions = 0; } static inline void mptcp_skb_ext_copy(struct sk_buff *to, struct sk_buff *from) { struct mptcp_ext *from_ext; from_ext = skb_ext_find(from, SKB_EXT_MPTCP); if (!from_ext) return; from_ext->frozen = 1; skb_ext_copy(to, from); } static inline bool mptcp_ext_matches(const struct mptcp_ext *to_ext, const struct mptcp_ext *from_ext) { /* MPTCP always clears the ext when adding it to the skb, so * holes do not bother us here */ return !from_ext || (to_ext && from_ext && !memcmp(from_ext, to_ext, sizeof(struct mptcp_ext))); } /* check if skbs can be collapsed. * MPTCP collapse is allowed if neither @to or @from carry an mptcp data * mapping, or if the extension of @to is the same as @from. * Collapsing is not possible if @to lacks an extension, but @from carries one. */ static inline bool mptcp_skb_can_collapse(const struct sk_buff *to, const struct sk_buff *from) { return mptcp_ext_matches(skb_ext_find(to, SKB_EXT_MPTCP), skb_ext_find(from, SKB_EXT_MPTCP)); } void mptcp_seq_show(struct seq_file *seq); int mptcp_subflow_init_cookie_req(struct request_sock *req, const struct sock *sk_listener, struct sk_buff *skb); struct request_sock *mptcp_subflow_reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk_listener, bool attach_listener); __be32 mptcp_get_reset_option(const struct sk_buff *skb); static inline __be32 mptcp_reset_option(const struct sk_buff *skb) { if (skb_ext_exist(skb, SKB_EXT_MPTCP)) return mptcp_get_reset_option(skb); return htonl(0u); } #else static inline void mptcp_init(void) { } static inline bool sk_is_mptcp(const struct sock *sk) { return false; } static inline bool rsk_is_mptcp(const struct request_sock *req) { return false; } static inline bool rsk_drop_req(const struct request_sock *req) { return false; } static inline void mptcp_parse_option(const struct sk_buff *skb, const unsigned char *ptr, int opsize, struct tcp_options_received *opt_rx) { } static inline bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb, unsigned int *size, struct mptcp_out_options *opts) { return false; } static inline bool mptcp_synack_options(const struct request_sock *req, unsigned int *size, struct mptcp_out_options *opts) { return false; } static inline bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, unsigned int *size, unsigned int remaining, struct mptcp_out_options *opts) { return false; } static inline bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb) { return true; } static inline void mptcp_skb_ext_move(struct sk_buff *to, const struct sk_buff *from) { } static inline void mptcp_skb_ext_copy(struct sk_buff *to, struct sk_buff *from) { } static inline bool mptcp_skb_can_collapse(const struct sk_buff *to, const struct sk_buff *from) { return true; } static inline void mptcp_space(const struct sock *ssk, int *s, int *fs) { } static inline void mptcp_seq_show(struct seq_file *seq) { } static inline int mptcp_subflow_init_cookie_req(struct request_sock *req, const struct sock *sk_listener, struct sk_buff *skb) { return 0; /* TCP fallback */ } static inline struct request_sock *mptcp_subflow_reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk_listener, bool attach_listener) { return NULL; } static inline __be32 mptcp_reset_option(const struct sk_buff *skb) { return htonl(0u); } #endif /* CONFIG_MPTCP */ #if IS_ENABLED(CONFIG_MPTCP_IPV6) int 
mptcpv6_init(void); void mptcpv6_handle_mapped(struct sock *sk, bool mapped); #elif IS_ENABLED(CONFIG_IPV6) static inline int mptcpv6_init(void) { return 0; } static inline void mptcpv6_handle_mapped(struct sock *sk, bool mapped) { } #endif #endif /* __NET_MPTCP_H */
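Hedged aside: mptcp_skb_can_collapse()/mptcp_ext_matches() above allow two skbs to be collapsed only when @from carries no MPTCP mapping, or when both carry identical extensions; collapsing is refused when @to lacks an extension but @from has one. The sketch below restates that rule over a stand-in struct (the sketch_* names are invented); it zeroes the object first, mirroring the header's note that MPTCP always clears the extension so padding holes cannot differ.

/*
 * Hedged sketch of the collapse rule from mptcp_ext_matches() above, using a
 * stand-in struct instead of real skb extensions.
 */
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

struct sketch_ext {
	unsigned long long data_seq;
	unsigned short data_len;
};

/* Collapse is allowed when @from carries no mapping, or when both skbs carry
 * the same mapping; it is refused when @to lacks an ext but @from has one. */
static bool sketch_ext_matches(const struct sketch_ext *to_ext,
			       const struct sketch_ext *from_ext)
{
	return !from_ext ||
	       (to_ext && !memcmp(from_ext, to_ext, sizeof(*from_ext)));
}

int main(void)
{
	struct sketch_ext a, b;

	/* start from an all-zero object so padding holes cannot make two
	 * equal mappings compare unequal */
	memset(&a, 0, sizeof(a));
	a.data_seq = 1;
	a.data_len = 100;
	memcpy(&b, &a, sizeof(b));

	printf("%d %d %d\n",
	       sketch_ext_matches(&a, NULL),	/* 1: @from has no mapping */
	       sketch_ext_matches(&a, &b),	/* 1: identical mappings */
	       sketch_ext_matches(NULL, &b));	/* 0: @to lacks an ext */
	return 0;
}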
// SPDX-License-Identifier: GPL-2.0-only #include "netlink.h" #include "common.h" struct pause_req_info { struct ethnl_req_info base; }; struct pause_reply_data { struct ethnl_reply_data base; struct ethtool_pauseparam pauseparam; struct ethtool_pause_stats pausestat; }; #define PAUSE_REPDATA(__reply_base) \ container_of(__reply_base, struct pause_reply_data, base) const struct nla_policy ethnl_pause_get_policy[] = { [ETHTOOL_A_PAUSE_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy_stats), }; static int pause_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, struct genl_info *info) { struct pause_reply_data *data = PAUSE_REPDATA(reply_base); struct net_device *dev = reply_base->dev; int ret; if (!dev->ethtool_ops->get_pauseparam) return -EOPNOTSUPP; ethtool_stats_init((u64 *)&data->pausestat, sizeof(data->pausestat) / 8); ret = ethnl_ops_begin(dev); if (ret < 0) return ret; dev->ethtool_ops->get_pauseparam(dev, &data->pauseparam); if (req_base->flags & ETHTOOL_FLAG_STATS && dev->ethtool_ops->get_pause_stats) dev->ethtool_ops->get_pause_stats(dev, &data->pausestat); ethnl_ops_complete(dev); return 0; } static int pause_reply_size(const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { int n = nla_total_size(sizeof(u8)) + /* _PAUSE_AUTONEG */ nla_total_size(sizeof(u8)) + /* _PAUSE_RX */ nla_total_size(sizeof(u8)); /* _PAUSE_TX */ if (req_base->flags & ETHTOOL_FLAG_STATS) n += nla_total_size(0) + /* _PAUSE_STATS */ nla_total_size_64bit(sizeof(u64)) * ETHTOOL_PAUSE_STAT_CNT; return n; } static int ethtool_put_stat(struct sk_buff *skb, u64 val, u16 attrtype, u16 padtype) { if (val == ETHTOOL_STAT_NOT_SET) return 0; if (nla_put_u64_64bit(skb, attrtype, val, padtype)) return -EMSGSIZE; return 0; } static int pause_put_stats(struct sk_buff *skb, const struct ethtool_pause_stats *pause_stats) { const u16 pad = ETHTOOL_A_PAUSE_STAT_PAD; struct nlattr *nest; nest = nla_nest_start(skb, ETHTOOL_A_PAUSE_STATS); if (!nest) return -EMSGSIZE; if (ethtool_put_stat(skb, pause_stats->tx_pause_frames, ETHTOOL_A_PAUSE_STAT_TX_FRAMES, pad) || ethtool_put_stat(skb, pause_stats->rx_pause_frames, ETHTOOL_A_PAUSE_STAT_RX_FRAMES, pad)) goto err_cancel; nla_nest_end(skb, nest); return 0; err_cancel: nla_nest_cancel(skb, nest); return -EMSGSIZE; } static int pause_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct pause_reply_data *data = PAUSE_REPDATA(reply_base); const struct ethtool_pauseparam *pauseparam = &data->pauseparam; if (nla_put_u8(skb, ETHTOOL_A_PAUSE_AUTONEG, !!pauseparam->autoneg) || nla_put_u8(skb, ETHTOOL_A_PAUSE_RX, !!pauseparam->rx_pause) || nla_put_u8(skb, ETHTOOL_A_PAUSE_TX, !!pauseparam->tx_pause)) return -EMSGSIZE; if (req_base->flags & ETHTOOL_FLAG_STATS && pause_put_stats(skb, &data->pausestat)) return -EMSGSIZE; return
0; } const struct ethnl_request_ops ethnl_pause_request_ops = { .request_cmd = ETHTOOL_MSG_PAUSE_GET, .reply_cmd = ETHTOOL_MSG_PAUSE_GET_REPLY, .hdr_attr = ETHTOOL_A_PAUSE_HEADER, .req_info_size = sizeof(struct pause_req_info), .reply_data_size = sizeof(struct pause_reply_data), .prepare_data = pause_prepare_data, .reply_size = pause_reply_size, .fill_reply = pause_fill_reply, }; /* PAUSE_SET */ const struct nla_policy ethnl_pause_set_policy[] = { [ETHTOOL_A_PAUSE_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), [ETHTOOL_A_PAUSE_AUTONEG] = { .type = NLA_U8 }, [ETHTOOL_A_PAUSE_RX] = { .type = NLA_U8 }, [ETHTOOL_A_PAUSE_TX] = { .type = NLA_U8 }, }; int ethnl_set_pause(struct sk_buff *skb, struct genl_info *info) { struct ethtool_pauseparam params = {}; struct ethnl_req_info req_info = {}; struct nlattr **tb = info->attrs; const struct ethtool_ops *ops; struct net_device *dev; bool mod = false; int ret; ret = ethnl_parse_header_dev_get(&req_info, tb[ETHTOOL_A_PAUSE_HEADER], genl_info_net(info), info->extack, true); if (ret < 0) return ret; dev = req_info.dev; ops = dev->ethtool_ops; ret = -EOPNOTSUPP; if (!ops->get_pauseparam || !ops->set_pauseparam) goto out_dev; rtnl_lock(); ret = ethnl_ops_begin(dev); if (ret < 0) goto out_rtnl; ops->get_pauseparam(dev, &params); ethnl_update_bool32(&params.autoneg, tb[ETHTOOL_A_PAUSE_AUTONEG], &mod); ethnl_update_bool32(&params.rx_pause, tb[ETHTOOL_A_PAUSE_RX], &mod); ethnl_update_bool32(&params.tx_pause, tb[ETHTOOL_A_PAUSE_TX], &mod); ret = 0; if (!mod) goto out_ops; ret = dev->ethtool_ops->set_pauseparam(dev, &params); if (ret < 0) goto out_ops; ethtool_notify(dev, ETHTOOL_MSG_PAUSE_NTF, NULL); out_ops: ethnl_ops_complete(dev); out_rtnl: rtnl_unlock(); out_dev: dev_put(dev); return ret; }
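Hedged aside: pause_prepare_data() above pre-fills the statistics block with a "not set" sentinel via ethtool_stats_init(), and ethtool_put_stat() silently skips any counter still holding that sentinel, so only counters the driver actually filled in reach the netlink reply. A minimal userspace sketch of that pattern follows; the all-ones sentinel value and the sketch_* names are assumptions made for illustration.

/*
 * Hedged sketch of the "not set" sentinel pattern used by
 * pause_prepare_data() and ethtool_put_stat() above.
 */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define SKETCH_STAT_NOT_SET UINT64_MAX

struct sketch_pause_stats {
	uint64_t tx_pause_frames;
	uint64_t rx_pause_frames;
};

static void sketch_stats_init(uint64_t *stats, unsigned int n)
{
	while (n--)
		*stats++ = SKETCH_STAT_NOT_SET;
}

static void sketch_put_stat(const char *name, uint64_t val)
{
	if (val == SKETCH_STAT_NOT_SET)
		return;		/* the attribute is simply omitted */
	printf("%s = %" PRIu64 "\n", name, val);
}

int main(void)
{
	struct sketch_pause_stats st;

	sketch_stats_init((uint64_t *)&st, sizeof(st) / sizeof(uint64_t));
	st.tx_pause_frames = 42;	/* pretend the driver only reports TX */

	sketch_put_stat("tx_pause_frames", st.tx_pause_frames);
	sketch_put_stat("rx_pause_frames", st.rx_pause_frames);
	return 0;
}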
// SPDX-License-Identifier: GPL-2.0-only /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Generic TIME_WAIT sockets functions * * From code originally in TCP */ #include <linux/kernel.h> #include <linux/slab.h> #include <linux/module.h> #include <net/inet_hashtables.h> #include <net/inet_timewait_sock.h> #include <net/ip.h> /** * inet_twsk_bind_unhash - unhash a timewait socket from bind hash * @tw: timewait socket * @hashinfo: hashinfo pointer * * unhash a timewait socket from bind hash, if hashed. * bind hash lock must be held by caller. * Returns 1 if caller should call inet_twsk_put() after lock release. */ void inet_twsk_bind_unhash(struct inet_timewait_sock *tw, struct inet_hashinfo *hashinfo) { struct inet_bind_bucket *tb = tw->tw_tb; if (!tb) return; __hlist_del(&tw->tw_bind_node); tw->tw_tb = NULL; inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb); __sock_put((struct sock *)tw); } /* Must be called with locally disabled BHs. */ static void inet_twsk_kill(struct inet_timewait_sock *tw) { struct inet_hashinfo *hashinfo = tw->tw_dr->hashinfo; spinlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash); struct inet_bind_hashbucket *bhead; spin_lock(lock); sk_nulls_del_node_init_rcu((struct sock *)tw); spin_unlock(lock); /* Disassociate with bind bucket.
*/ bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), tw->tw_num, hashinfo->bhash_size)]; spin_lock(&bhead->lock); inet_twsk_bind_unhash(tw, hashinfo); spin_unlock(&bhead->lock); atomic_dec(&tw->tw_dr->tw_count); inet_twsk_put(tw); } void inet_twsk_free(struct inet_timewait_sock *tw) { struct module *owner = tw->tw_prot->owner; twsk_destructor((struct sock *)tw); #ifdef SOCK_REFCNT_DEBUG pr_debug("%s timewait_sock %p released\n", tw->tw_prot->name, tw); #endif kmem_cache_free(tw->tw_prot->twsk_prot->twsk_slab, tw); module_put(owner); } void inet_twsk_put(struct inet_timewait_sock *tw) { if (refcount_dec_and_test(&tw->tw_refcnt)) inet_twsk_free(tw); } EXPORT_SYMBOL_GPL(inet_twsk_put); static void inet_twsk_add_node_rcu(struct inet_timewait_sock *tw, struct hlist_nulls_head *list) { hlist_nulls_add_head_rcu(&tw->tw_node, list); } static void inet_twsk_add_bind_node(struct inet_timewait_sock *tw, struct hlist_head *list) { hlist_add_head(&tw->tw_bind_node, list); } /* * Enter the time wait state. This is called with locally disabled BH. * Essentially we whip up a timewait bucket, copy the relevant info into it * from the SK, and mess with hash chains and list linkage. */ void inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, struct inet_hashinfo *hashinfo) { const struct inet_sock *inet = inet_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); struct inet_ehash_bucket *ehead = inet_ehash_bucket(hashinfo, sk->sk_hash); spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash); struct inet_bind_hashbucket *bhead; /* Step 1: Put TW into bind hash. Original socket stays there too. Note, that any socket with inet->num != 0 MUST be bound in binding cache, even if it is closed. */ bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), inet->inet_num, hashinfo->bhash_size)]; spin_lock(&bhead->lock); tw->tw_tb = icsk->icsk_bind_hash; WARN_ON(!icsk->icsk_bind_hash); inet_twsk_add_bind_node(tw, &tw->tw_tb->owners); spin_unlock(&bhead->lock); spin_lock(lock); inet_twsk_add_node_rcu(tw, &ehead->chain); /* Step 3: Remove SK from hash chain */ if (__sk_nulls_del_node_init_rcu(sk)) sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); spin_unlock(lock); /* tw_refcnt is set to 3 because we have : * - one reference for bhash chain. * - one reference for ehash chain. * - one reference for timer. * We can use atomic_set() because prior spin_lock()/spin_unlock() * committed into memory all tw fields. * Also note that after this point, we lost our implicit reference * so we are not allowed to use tw anymore. */ refcount_set(&tw->tw_refcnt, 3); } EXPORT_SYMBOL_GPL(inet_twsk_hashdance); static void tw_timer_handler(struct timer_list *t) { struct inet_timewait_sock *tw = from_timer(tw, t, tw_timer); if (tw->tw_kill) __NET_INC_STATS(twsk_net(tw), LINUX_MIB_TIMEWAITKILLED); else __NET_INC_STATS(twsk_net(tw), LINUX_MIB_TIMEWAITED); inet_twsk_kill(tw); } struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, struct inet_timewait_death_row *dr, const int state) { struct inet_timewait_sock *tw; if (atomic_read(&dr->tw_count) >= dr->sysctl_max_tw_buckets) return NULL; tw = kmem_cache_alloc(sk->sk_prot_creator->twsk_prot->twsk_slab, GFP_ATOMIC); if (tw) { const struct inet_sock *inet = inet_sk(sk); tw->tw_dr = dr; /* Give us an identity. 
*/ tw->tw_daddr = inet->inet_daddr; tw->tw_rcv_saddr = inet->inet_rcv_saddr; tw->tw_bound_dev_if = sk->sk_bound_dev_if; tw->tw_tos = inet->tos; tw->tw_num = inet->inet_num; tw->tw_state = TCP_TIME_WAIT; tw->tw_substate = state; tw->tw_sport = inet->inet_sport; tw->tw_dport = inet->inet_dport; tw->tw_family = sk->sk_family; tw->tw_reuse = sk->sk_reuse; tw->tw_reuseport = sk->sk_reuseport; tw->tw_hash = sk->sk_hash; tw->tw_ipv6only = 0; tw->tw_transparent = inet->transparent; tw->tw_prot = sk->sk_prot_creator; atomic64_set(&tw->tw_cookie, atomic64_read(&sk->sk_cookie)); twsk_net_set(tw, sock_net(sk)); timer_setup(&tw->tw_timer, tw_timer_handler, TIMER_PINNED); /* * Because we use RCU lookups, we should not set tw_refcnt * to a non-null value before everything is set up for this * timewait socket. */ refcount_set(&tw->tw_refcnt, 0); __module_get(tw->tw_prot->owner); } return tw; } EXPORT_SYMBOL_GPL(inet_twsk_alloc); /* These are always called from BH context. See callers in * tcp_input.c to verify this. */ /* This is for handling early-kills of TIME_WAIT sockets. * Warning: consumes the reference. * Caller should not access tw anymore. */ void inet_twsk_deschedule_put(struct inet_timewait_sock *tw) { if (del_timer_sync(&tw->tw_timer)) inet_twsk_kill(tw); inet_twsk_put(tw); } EXPORT_SYMBOL(inet_twsk_deschedule_put); void __inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo, bool rearm) { /* timeout := RTO * 3.5 * * 3.5 = 1+2+0.5 to wait for two retransmits. * * RATIONALE: if FIN arrived and we entered TIME-WAIT state, * our ACK acking that FIN can be lost. If N subsequent retransmitted * FINs (or previous segments) are lost (probability of such event * is p^(N+1), where p is the probability of losing a single packet and * the time to detect the loss is about RTO*(2^N - 1) with exponential * backoff). Normal timewait length is calculated so that we * wait at least for one retransmitted FIN (maximal RTO is 120sec). * [ BTW Linux, following BSD, violates this requirement, waiting * only for 60sec; we should wait at least for 240 secs. * Well, 240 consumes too much of resources 8) * ] * This interval is not reduced to catch old duplicates and * responses to our wandering segments living for two MSLs. * However, if we use PAWS to detect * old duplicates, we can reduce the interval to bounds required * by RTO, rather than MSL. So, if peer understands PAWS, we * kill tw bucket after 3.5*RTO (it is important that this number * is greater than TS tick!) and detect old duplicates with help * of PAWS.
*/ tw->tw_kill = timeo <= 4*HZ; if (!rearm) { BUG_ON(mod_timer(&tw->tw_timer, jiffies + timeo)); atomic_inc(&tw->tw_dr->tw_count); } else { mod_timer_pending(&tw->tw_timer, jiffies + timeo); } } EXPORT_SYMBOL_GPL(__inet_twsk_schedule); /* Remove all non full sockets (TIME_WAIT and NEW_SYN_RECV) for dead netns */ void inet_twsk_purge(struct inet_hashinfo *hashinfo, int family) { struct hlist_nulls_node *node; unsigned int slot; struct sock *sk; for (slot = 0; slot <= hashinfo->ehash_mask; slot++) { struct inet_ehash_bucket *head = &hashinfo->ehash[slot]; restart_rcu: cond_resched(); rcu_read_lock(); restart: sk_nulls_for_each_rcu(sk, node, &head->chain) { int state = inet_sk_state_load(sk); if ((1 << state) & ~(TCPF_TIME_WAIT | TCPF_NEW_SYN_RECV)) continue; if (sk->sk_family != family || refcount_read(&sock_net(sk)->ns.count)) continue; if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt))) continue; if (unlikely(sk->sk_family != family || refcount_read(&sock_net(sk)->ns.count))) { sock_gen_put(sk); goto restart; } rcu_read_unlock(); local_bh_disable(); if (state == TCP_TIME_WAIT) { inet_twsk_deschedule_put(inet_twsk(sk)); } else { struct request_sock *req = inet_reqsk(sk); inet_csk_reqsk_queue_drop_and_put(req->rsk_listener, req); } local_bh_enable(); goto restart_rcu; } /* If the nulls value we got at the end of this lookup is * not the expected one, we must restart lookup. * We probably met an item that was moved to another chain. */ if (get_nulls_value(node) != slot) goto restart; rcu_read_unlock(); } } EXPORT_SYMBOL_GPL(inet_twsk_purge);
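Hedged aside: the __inet_twsk_schedule() comment above sizes the TIME-WAIT interval as roughly 3.5*RTO (one FIN plus two retransmits plus half an RTO), and tw_timer_handler() counts a bucket as TIMEWAITKILLED when it was armed with timeo <= 4*HZ. The sketch below simply replays that arithmetic; HZ = 1000 and the sketch_* names are assumptions made for illustration.

/*
 * Hedged sketch of the 3.5*RTO timeout arithmetic and the tw_kill threshold
 * described above.
 */
#include <stdbool.h>
#include <stdio.h>

#define SKETCH_HZ 1000	/* jiffies per second, assumed for the example */

static unsigned long sketch_timewait_timeo(unsigned long rto_jiffies)
{
	/* 3.5 * RTO == RTO + 2*RTO + RTO/2 */
	return rto_jiffies + 2 * rto_jiffies + rto_jiffies / 2;
}

int main(void)
{
	unsigned long rto = SKETCH_HZ / 5;		/* a 200 ms RTO */
	unsigned long timeo = sketch_timewait_timeo(rto);
	bool kill = timeo <= 4 * SKETCH_HZ;		/* tw_kill condition */

	printf("timeo = %lu jiffies, counted as %s\n",
	       timeo, kill ? "TIMEWAITKILLED" : "TIMEWAITED");
	return 0;
}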
/* SPDX-License-Identifier:
GPL-2.0 */ #ifndef _LINUX_PGTABLE_H #define _LINUX_PGTABLE_H #include <linux/pfn.h> #include <asm/pgtable.h> #ifndef __ASSEMBLY__ #ifdef CONFIG_MMU #include <linux/mm_types.h> #include <linux/bug.h> #include <linux/errno.h> #include <asm-generic/pgtable_uffd.h> #if 5 - defined(__PAGETABLE_P4D_FOLDED) - defined(__PAGETABLE_PUD_FOLDED) - \ defined(__PAGETABLE_PMD_FOLDED) != CONFIG_PGTABLE_LEVELS #error CONFIG_PGTABLE_LEVELS is not consistent with __PAGETABLE_{P4D,PUD,PMD}_FOLDED #endif /* * On almost all architectures and configurations, 0 can be used as the * upper ceiling to free_pgtables(): on many architectures it has the same * effect as using TASK_SIZE. However, there is one configuration which * must impose a more careful limit, to avoid freeing kernel pgtables. */ #ifndef USER_PGTABLES_CEILING #define USER_PGTABLES_CEILING 0UL #endif /* * This defines the first usable user address. Platforms * can override its value with custom FIRST_USER_ADDRESS * defined in their respective <asm/pgtable.h>. */ #ifndef FIRST_USER_ADDRESS #define FIRST_USER_ADDRESS 0UL #endif /* * This defines the generic helper for accessing PMD page * table page. Although platforms can still override this * via their respective <asm/pgtable.h>. */ #ifndef pmd_pgtable #define pmd_pgtable(pmd) pmd_page(pmd) #endif /* * A page table page can be thought of an array like this: pXd_t[PTRS_PER_PxD] * * The pXx_index() functions return the index of the entry in the page * table page which would control the given virtual address * * As these functions may be used by the same code for different levels of * the page table folding, they are always available, regardless of * CONFIG_PGTABLE_LEVELS value. For the folded levels they simply return 0 * because in such cases PTRS_PER_PxD equals 1. */ static inline unsigned long pte_index(unsigned long address) { return (address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); } #define pte_index pte_index #ifndef pmd_index static inline unsigned long pmd_index(unsigned long address) { return (address >> PMD_SHIFT) & (PTRS_PER_PMD - 1); } #define pmd_index pmd_index #endif #ifndef pud_index static inline unsigned long pud_index(unsigned long address) { return (address >> PUD_SHIFT) & (PTRS_PER_PUD - 1); } #define pud_index pud_index #endif #ifndef pgd_index /* Must be a compile-time constant, so implement it as a macro */ #define pgd_index(a) (((a) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1)) #endif #ifndef pte_offset_kernel static inline pte_t *pte_offset_kernel(pmd_t *pmd, unsigned long address) { return (pte_t *)pmd_page_vaddr(*pmd) + pte_index(address); } #define pte_offset_kernel pte_offset_kernel #endif #if defined(CONFIG_HIGHPTE) #define pte_offset_map(dir, address) \ ((pte_t *)kmap_atomic(pmd_page(*(dir))) + \ pte_index((address))) #define pte_unmap(pte) kunmap_atomic((pte)) #else #define pte_offset_map(dir, address) pte_offset_kernel((dir), (address)) #define pte_unmap(pte) ((void)(pte)) /* NOP */ #endif /* Find an entry in the second-level page table.. 
*/ #ifndef pmd_offset static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address) { return pud_pgtable(*pud) + pmd_index(address); } #define pmd_offset pmd_offset #endif #ifndef pud_offset static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address) { return p4d_pgtable(*p4d) + pud_index(address); } #define pud_offset pud_offset #endif static inline pgd_t *pgd_offset_pgd(pgd_t *pgd, unsigned long address) { return (pgd + pgd_index(address)); }; /* * a shortcut to get a pgd_t in a given mm */ #ifndef pgd_offset #define pgd_offset(mm, address) pgd_offset_pgd((mm)->pgd, (address)) #endif /* * a shortcut which implies the use of the kernel's pgd, instead * of a process's */ #ifndef pgd_offset_k #define pgd_offset_k(address) pgd_offset(&init_mm, (address)) #endif /* * In many cases it is known that a virtual address is mapped at PMD or PTE * level, so instead of traversing all the page table levels, we can get a * pointer to the PMD entry in user or kernel page table or translate a virtual * address to the pointer in the PTE in the kernel page tables with simple * helpers. */ static inline pmd_t *pmd_off(struct mm_struct *mm, unsigned long va) { return pmd_offset(pud_offset(p4d_offset(pgd_offset(mm, va), va), va), va); } static inline pmd_t *pmd_off_k(unsigned long va) { return pmd_offset(pud_offset(p4d_offset(pgd_offset_k(va), va), va), va); } static inline pte_t *virt_to_kpte(unsigned long vaddr) { pmd_t *pmd = pmd_off_k(vaddr); return pmd_none(*pmd) ? NULL : pte_offset_kernel(pmd, vaddr); } #ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS extern int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pte_t entry, int dirty); #endif #ifndef __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS #ifdef CONFIG_TRANSPARENT_HUGEPAGE extern int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t entry, int dirty); extern int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pud_t *pudp, pud_t entry, int dirty); #else static inline int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t entry, int dirty) { BUILD_BUG(); return 0; } static inline int pudp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pud_t *pudp, pud_t entry, int dirty) { BUILD_BUG(); return 0; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif #ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { pte_t pte = *ptep; int r = 1; if (!pte_young(pte)) r = 0; else set_pte_at(vma->vm_mm, address, ptep, pte_mkold(pte)); return r; } #endif #ifndef __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG #ifdef CONFIG_TRANSPARENT_HUGEPAGE static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { pmd_t pmd = *pmdp; int r = 1; if (!pmd_young(pmd)) r = 0; else set_pmd_at(vma->vm_mm, address, pmdp, pmd_mkold(pmd)); return r; } #else static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { BUILD_BUG(); return 0; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif #ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH int ptep_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pte_t *ptep); #endif #ifndef __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH #ifdef CONFIG_TRANSPARENT_HUGEPAGE extern int pmdp_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); #else /* * Despite relevant to THP only, this API 
is called from generic rmap code * under PageTransHuge(), hence needs a dummy implementation for !THP */ static inline int pmdp_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { BUILD_BUG(); return 0; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif #ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long address, pte_t *ptep) { pte_t pte = *ptep; pte_clear(mm, address, ptep); return pte; } #endif #ifndef __HAVE_ARCH_PTEP_GET static inline pte_t ptep_get(pte_t *ptep) { return READ_ONCE(*ptep); } #endif #ifdef CONFIG_GUP_GET_PTE_LOW_HIGH /* * WARNING: only to be used in the get_user_pages_fast() implementation. * * With get_user_pages_fast(), we walk down the pagetables without taking any * locks. For this we would like to load the pointers atomically, but sometimes * that is not possible (e.g. without expensive cmpxchg8b on x86_32 PAE). What * we do have is the guarantee that a PTE will only either go from not present * to present, or present to not present or both -- it will not switch to a * completely different present page without a TLB flush in between; something * that we are blocking by holding interrupts off. * * Setting ptes from not present to present goes: * * ptep->pte_high = h; * smp_wmb(); * ptep->pte_low = l; * * And present to not present goes: * * ptep->pte_low = 0; * smp_wmb(); * ptep->pte_high = 0; * * We must ensure here that the load of pte_low sees 'l' IFF pte_high sees 'h'. * We load pte_high *after* loading pte_low, which ensures we don't see an older * value of pte_high. *Then* we recheck pte_low, which ensures that we haven't * picked up a changed pte high. We might have gotten rubbish values from * pte_low and pte_high, but we are guaranteed that pte_low will not have the * present bit set *unless* it is 'l'. Because get_user_pages_fast() only * operates on present ptes we're safe. */ static inline pte_t ptep_get_lockless(pte_t *ptep) { pte_t pte; do { pte.pte_low = ptep->pte_low; smp_rmb(); pte.pte_high = ptep->pte_high; smp_rmb(); } while (unlikely(pte.pte_low != ptep->pte_low)); return pte; } #else /* CONFIG_GUP_GET_PTE_LOW_HIGH */ /* * We require that the PTE can be read atomically. 
*/ static inline pte_t ptep_get_lockless(pte_t *ptep) { return ptep_get(ptep); } #endif /* CONFIG_GUP_GET_PTE_LOW_HIGH */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE #ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long address, pmd_t *pmdp) { pmd_t pmd = *pmdp; pmd_clear(pmdp); return pmd; } #endif /* __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR */ #ifndef __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm, unsigned long address, pud_t *pudp) { pud_t pud = *pudp; pud_clear(pudp); return pud; } #endif /* __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR */ #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE #ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR_FULL static inline pmd_t pmdp_huge_get_and_clear_full(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, int full) { return pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp); } #endif #ifndef __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR_FULL static inline pud_t pudp_huge_get_and_clear_full(struct mm_struct *mm, unsigned long address, pud_t *pudp, int full) { return pudp_huge_get_and_clear(mm, address, pudp); } #endif #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, unsigned long address, pte_t *ptep, int full) { pte_t pte; pte = ptep_get_and_clear(mm, address, ptep); return pte; } #endif /* * If two threads concurrently fault at the same page, the thread that * won the race updates the PTE and its local TLB/Cache. The other thread * gives up, simply does nothing, and continues; on architectures where * software can update TLB, local TLB can be updated here to avoid next page * fault. This function updates TLB only, do nothing with cache or others. * It is the difference with function update_mmu_cache. */ #ifndef __HAVE_ARCH_UPDATE_MMU_TLB static inline void update_mmu_tlb(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { } #define __HAVE_ARCH_UPDATE_MMU_TLB #endif /* * Some architectures may be able to avoid expensive synchronization * primitives when modifications are made to PTE's which are already * not present, or in the process of an address space destruction. */ #ifndef __HAVE_ARCH_PTE_CLEAR_NOT_PRESENT_FULL static inline void pte_clear_not_present_full(struct mm_struct *mm, unsigned long address, pte_t *ptep, int full) { pte_clear(mm, address, ptep); } #endif #ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH extern pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address, pte_t *ptep); #endif #ifndef __HAVE_ARCH_PMDP_HUGE_CLEAR_FLUSH extern pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); extern pud_t pudp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address, pud_t *pudp); #endif #ifndef __HAVE_ARCH_PTEP_SET_WRPROTECT struct mm_struct; static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long address, pte_t *ptep) { pte_t old_pte = *ptep; set_pte_at(mm, address, ptep, pte_wrprotect(old_pte)); } #endif /* * On some architectures hardware does not set page access bit when accessing * memory page, it is responsibility of software setting this bit. It brings * out extra page fault penalty to track page access bit. For optimization page * access bit can be set during all page fault flow on these arches. * To be differentiate with macro pte_mkyoung, this macro is used on platforms * where software maintains page access bit. 
*/ #ifndef pte_sw_mkyoung static inline pte_t pte_sw_mkyoung(pte_t pte) { return pte; } #define pte_sw_mkyoung pte_sw_mkyoung #endif #ifndef pte_savedwrite #define pte_savedwrite pte_write #endif #ifndef pte_mk_savedwrite #define pte_mk_savedwrite pte_mkwrite #endif #ifndef pte_clear_savedwrite #define pte_clear_savedwrite pte_wrprotect #endif #ifndef pmd_savedwrite #define pmd_savedwrite pmd_write #endif #ifndef pmd_mk_savedwrite #define pmd_mk_savedwrite pmd_mkwrite #endif #ifndef pmd_clear_savedwrite #define pmd_clear_savedwrite pmd_wrprotect #endif #ifndef __HAVE_ARCH_PMDP_SET_WRPROTECT #ifdef CONFIG_TRANSPARENT_HUGEPAGE static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long address, pmd_t *pmdp) { pmd_t old_pmd = *pmdp; set_pmd_at(mm, address, pmdp, pmd_wrprotect(old_pmd)); } #else static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long address, pmd_t *pmdp) { BUILD_BUG(); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif #ifndef __HAVE_ARCH_PUDP_SET_WRPROTECT #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD static inline void pudp_set_wrprotect(struct mm_struct *mm, unsigned long address, pud_t *pudp) { pud_t old_pud = *pudp; set_pud_at(mm, address, pudp, pud_wrprotect(old_pud)); } #else static inline void pudp_set_wrprotect(struct mm_struct *mm, unsigned long address, pud_t *pudp) { BUILD_BUG(); } #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ #endif #ifndef pmdp_collapse_flush #ifdef CONFIG_TRANSPARENT_HUGEPAGE extern pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); #else static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { BUILD_BUG(); return *pmdp; } #define pmdp_collapse_flush pmdp_collapse_flush #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif #ifndef __HAVE_ARCH_PGTABLE_DEPOSIT extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, pgtable_t pgtable); #endif #ifndef __HAVE_ARCH_PGTABLE_WITHDRAW extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp); #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* * This is an implementation of pmdp_establish() that is only suitable for an * architecture that doesn't have hardware dirty/accessed bits. In this case we * can't race with CPU which sets these bits and non-atomic approach is fine. */ static inline pmd_t generic_pmdp_establish(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t pmd) { pmd_t old_pmd = *pmdp; set_pmd_at(vma->vm_mm, address, pmdp, pmd); return old_pmd; } #endif #ifndef __HAVE_ARCH_PMDP_INVALIDATE extern pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); #endif #ifndef __HAVE_ARCH_PTE_SAME static inline int pte_same(pte_t pte_a, pte_t pte_b) { return pte_val(pte_a) == pte_val(pte_b); } #endif #ifndef __HAVE_ARCH_PTE_UNUSED /* * Some architectures provide facilities to virtualization guests * so that they can flag allocated pages as unused. This allows the * host to transparently reclaim unused pages. This function returns * whether the pte's page is unused. 
*/ static inline int pte_unused(pte_t pte) { return 0; } #endif #ifndef pte_access_permitted #define pte_access_permitted(pte, write) \ (pte_present(pte) && (!(write) || pte_write(pte))) #endif #ifndef pmd_access_permitted #define pmd_access_permitted(pmd, write) \ (pmd_present(pmd) && (!(write) || pmd_write(pmd))) #endif #ifndef pud_access_permitted #define pud_access_permitted(pud, write) \ (pud_present(pud) && (!(write) || pud_write(pud))) #endif #ifndef p4d_access_permitted #define p4d_access_permitted(p4d, write) \ (p4d_present(p4d) && (!(write) || p4d_write(p4d))) #endif #ifndef pgd_access_permitted #define pgd_access_permitted(pgd, write) \ (pgd_present(pgd) && (!(write) || pgd_write(pgd))) #endif #ifndef __HAVE_ARCH_PMD_SAME static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b) { return pmd_val(pmd_a) == pmd_val(pmd_b); } static inline int pud_same(pud_t pud_a, pud_t pud_b) { return pud_val(pud_a) == pud_val(pud_b); } #endif #ifndef __HAVE_ARCH_P4D_SAME static inline int p4d_same(p4d_t p4d_a, p4d_t p4d_b) { return p4d_val(p4d_a) == p4d_val(p4d_b); } #endif #ifndef __HAVE_ARCH_PGD_SAME static inline int pgd_same(pgd_t pgd_a, pgd_t pgd_b) { return pgd_val(pgd_a) == pgd_val(pgd_b); } #endif /* * Use set_p*_safe(), and elide TLB flushing, when confident that *no* * TLB flush will be required as a result of the "set". For example, use * in scenarios where it is known ahead of time that the routine is * setting non-present entries, or re-setting an existing entry to the * same value. Otherwise, use the typical "set" helpers and flush the * TLB. */ #define set_pte_safe(ptep, pte) \ ({ \ WARN_ON_ONCE(pte_present(*ptep) && !pte_same(*ptep, pte)); \ set_pte(ptep, pte); \ }) #define set_pmd_safe(pmdp, pmd) \ ({ \ WARN_ON_ONCE(pmd_present(*pmdp) && !pmd_same(*pmdp, pmd)); \ set_pmd(pmdp, pmd); \ }) #define set_pud_safe(pudp, pud) \ ({ \ WARN_ON_ONCE(pud_present(*pudp) && !pud_same(*pudp, pud)); \ set_pud(pudp, pud); \ }) #define set_p4d_safe(p4dp, p4d) \ ({ \ WARN_ON_ONCE(p4d_present(*p4dp) && !p4d_same(*p4dp, p4d)); \ set_p4d(p4dp, p4d); \ }) #define set_pgd_safe(pgdp, pgd) \ ({ \ WARN_ON_ONCE(pgd_present(*pgdp) && !pgd_same(*pgdp, pgd)); \ set_pgd(pgdp, pgd); \ }) #ifndef __HAVE_ARCH_DO_SWAP_PAGE /* * Some architectures support metadata associated with a page. When a * page is being swapped out, this metadata must be saved so it can be * restored when the page is swapped back in. SPARC M7 and newer * processors support an ADI (Application Data Integrity) tag for the * page as metadata for the page. arch_do_swap_page() can restore this * metadata when a page is swapped back in. */ static inline void arch_do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pte_t pte, pte_t oldpte) { } #endif #ifndef __HAVE_ARCH_UNMAP_ONE /* * Some architectures support metadata associated with a page. When a * page is being swapped out, this metadata must be saved so it can be * restored when the page is swapped back in. SPARC M7 and newer * processors support an ADI (Application Data Integrity) tag for the * page as metadata for the page. arch_unmap_one() can save this * metadata on a swap-out of a page. */ static inline int arch_unmap_one(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pte_t orig_pte) { return 0; } #endif /* * Allow architectures to preserve additional metadata associated with * swapped-out pages. The corresponding __HAVE_ARCH_SWAP_* macros and function * prototypes must be defined in the arch-specific asm/pgtable.h file. 
*/ #ifndef __HAVE_ARCH_PREPARE_TO_SWAP static inline int arch_prepare_to_swap(struct page *page) { return 0; } #endif #ifndef __HAVE_ARCH_SWAP_INVALIDATE static inline void arch_swap_invalidate_page(int type, pgoff_t offset) { } static inline void arch_swap_invalidate_area(int type) { } #endif #ifndef __HAVE_ARCH_SWAP_RESTORE static inline void arch_swap_restore(swp_entry_t entry, struct page *page) { } #endif #ifndef __HAVE_ARCH_PGD_OFFSET_GATE #define pgd_offset_gate(mm, addr) pgd_offset(mm, addr) #endif #ifndef __HAVE_ARCH_MOVE_PTE #define move_pte(pte, prot, old_addr, new_addr) (pte) #endif #ifndef pte_accessible # define pte_accessible(mm, pte) ((void)(pte), 1) #endif #ifndef flush_tlb_fix_spurious_fault #define flush_tlb_fix_spurious_fault(vma, address) flush_tlb_page(vma, address) #endif /* * When walking page tables, get the address of the next boundary, * or the end address of the range if that comes earlier. Although no * vma end wraps to 0, rounded up __boundary may wrap to 0 throughout. */ #define pgd_addr_end(addr, end) \ ({ unsigned long __boundary = ((addr) + PGDIR_SIZE) & PGDIR_MASK; \ (__boundary - 1 < (end) - 1)? __boundary: (end); \ }) #ifndef p4d_addr_end #define p4d_addr_end(addr, end) \ ({ unsigned long __boundary = ((addr) + P4D_SIZE) & P4D_MASK; \ (__boundary - 1 < (end) - 1)? __boundary: (end); \ }) #endif #ifndef pud_addr_end #define pud_addr_end(addr, end) \ ({ unsigned long __boundary = ((addr) + PUD_SIZE) & PUD_MASK; \ (__boundary - 1 < (end) - 1)? __boundary: (end); \ }) #endif #ifndef pmd_addr_end #define pmd_addr_end(addr, end) \ ({ unsigned long __boundary = ((addr) + PMD_SIZE) & PMD_MASK; \ (__boundary - 1 < (end) - 1)? __boundary: (end); \ }) #endif /* * When walking page tables, we usually want to skip any p?d_none entries; * and any p?d_bad entries - reporting the error before resetting to none. * Do the tests inline, but report and clear the bad entry in mm/memory.c. */ void pgd_clear_bad(pgd_t *); #ifndef __PAGETABLE_P4D_FOLDED void p4d_clear_bad(p4d_t *); #else #define p4d_clear_bad(p4d) do { } while (0) #endif #ifndef __PAGETABLE_PUD_FOLDED void pud_clear_bad(pud_t *); #else #define pud_clear_bad(p4d) do { } while (0) #endif void pmd_clear_bad(pmd_t *); static inline int pgd_none_or_clear_bad(pgd_t *pgd) { if (pgd_none(*pgd)) return 1; if (unlikely(pgd_bad(*pgd))) { pgd_clear_bad(pgd); return 1; } return 0; } static inline int p4d_none_or_clear_bad(p4d_t *p4d) { if (p4d_none(*p4d)) return 1; if (unlikely(p4d_bad(*p4d))) { p4d_clear_bad(p4d); return 1; } return 0; } static inline int pud_none_or_clear_bad(pud_t *pud) { if (pud_none(*pud)) return 1; if (unlikely(pud_bad(*pud))) { pud_clear_bad(pud); return 1; } return 0; } static inline int pmd_none_or_clear_bad(pmd_t *pmd) { if (pmd_none(*pmd)) return 1; if (unlikely(pmd_bad(*pmd))) { pmd_clear_bad(pmd); return 1; } return 0; } static inline pte_t __ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { /* * Get the current pte state, but zero it out to make it * non-present, preventing the hardware from asynchronously * updating it. */ return ptep_get_and_clear(vma->vm_mm, addr, ptep); } static inline void __ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t pte) { /* * The pte is non-present, so there's no hardware state to * preserve. 
*/ set_pte_at(vma->vm_mm, addr, ptep, pte); } #ifndef __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION /* * Start a pte protection read-modify-write transaction, which * protects against asynchronous hardware modifications to the pte. * The intention is not to prevent the hardware from making pte * updates, but to prevent any updates it may make from being lost. * * This does not protect against other software modifications of the * pte; the appropriate pte lock must be held over the transaction. * * Note that this interface is intended to be batchable, meaning that * ptep_modify_prot_commit may not actually update the pte, but merely * queue the update to be done at some later time. The update must be * actually committed before the pte lock is released, however. */ static inline pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { return __ptep_modify_prot_start(vma, addr, ptep); } /* * Commit an update to a pte, leaving any hardware-controlled bits in * the PTE unmodified. */ static inline void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t old_pte, pte_t pte) { __ptep_modify_prot_commit(vma, addr, ptep, pte); } #endif /* __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION */ #endif /* CONFIG_MMU */ /* * No-op macros that just return the current protection value. Defined here * because these macros can be used even if CONFIG_MMU is not defined. */ #ifndef pgprot_nx #define pgprot_nx(prot) (prot) #endif #ifndef pgprot_noncached #define pgprot_noncached(prot) (prot) #endif #ifndef pgprot_writecombine #define pgprot_writecombine pgprot_noncached #endif #ifndef pgprot_writethrough #define pgprot_writethrough pgprot_noncached #endif #ifndef pgprot_device #define pgprot_device pgprot_noncached #endif #ifndef pgprot_mhp #define pgprot_mhp(prot) (prot) #endif #ifdef CONFIG_MMU #ifndef pgprot_modify #define pgprot_modify pgprot_modify static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) { if (pgprot_val(oldprot) == pgprot_val(pgprot_noncached(oldprot))) newprot = pgprot_noncached(newprot); if (pgprot_val(oldprot) == pgprot_val(pgprot_writecombine(oldprot))) newprot = pgprot_writecombine(newprot); if (pgprot_val(oldprot) == pgprot_val(pgprot_device(oldprot))) newprot = pgprot_device(newprot); return newprot; } #endif #endif /* CONFIG_MMU */ #ifndef pgprot_encrypted #define pgprot_encrypted(prot) (prot) #endif #ifndef pgprot_decrypted #define pgprot_decrypted(prot) (prot) #endif /* * A facility to provide lazy MMU batching. This allows PTE updates and * page invalidations to be delayed until a call to leave lazy MMU mode * is issued. Some architectures may benefit from doing this, and it is * beneficial for both shadow and direct mode hypervisors, which may batch * the PTE updates which happen during this window. Note that using this * interface requires that read hazards be removed from the code. A read * hazard could result in the direct mode hypervisor case, since the actual * write to the page tables may not yet have taken place, so reads though * a raw PTE pointer after it has been modified are not guaranteed to be * up to date. This mode can only be entered and left under the protection of * the page table locks for all page tables which may be modified. In the UP * case, this is required so that preemption is disabled, and in the SMP case, * it must synchronize the delayed page table writes properly on other CPUs. 
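 *
 * Illustrative sketch only, not part of this header: a batched PTE update
 * under the page table lock on an architecture that implements the lazy
 * MMU hooks. "mm", "ptep", "addr" and "end" are hypothetical locals of
 * the caller.
 *
 *	arch_enter_lazy_mmu_mode();
 *	for (; addr != end; ptep++, addr += PAGE_SIZE)
 *		set_pte_at(mm, addr, ptep, pte_wrprotect(ptep_get(ptep)));
 *	arch_leave_lazy_mmu_mode();	// queued PTE updates are flushed here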
*/ #ifndef __HAVE_ARCH_ENTER_LAZY_MMU_MODE #define arch_enter_lazy_mmu_mode() do {} while (0) #define arch_leave_lazy_mmu_mode() do {} while (0) #define arch_flush_lazy_mmu_mode() do {} while (0) #endif /* * A facility to provide batching of the reload of page tables and * other process state with the actual context switch code for * paravirtualized guests. By convention, only one of the batched * update (lazy) modes (CPU, MMU) should be active at any given time, * entry should never be nested, and entry and exits should always be * paired. This is for sanity of maintaining and reasoning about the * kernel code. In this case, the exit (end of the context switch) is * in architecture-specific code, and so doesn't need a generic * definition. */ #ifndef __HAVE_ARCH_START_CONTEXT_SWITCH #define arch_start_context_switch(prev) do {} while (0) #endif #ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY #ifndef CONFIG_ARCH_ENABLE_THP_MIGRATION static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd) { return pmd; } static inline int pmd_swp_soft_dirty(pmd_t pmd) { return 0; } static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd) { return pmd; } #endif #else /* !CONFIG_HAVE_ARCH_SOFT_DIRTY */ static inline int pte_soft_dirty(pte_t pte) { return 0; } static inline int pmd_soft_dirty(pmd_t pmd) { return 0; } static inline pte_t pte_mksoft_dirty(pte_t pte) { return pte; } static inline pmd_t pmd_mksoft_dirty(pmd_t pmd) { return pmd; } static inline pte_t pte_clear_soft_dirty(pte_t pte) { return pte; } static inline pmd_t pmd_clear_soft_dirty(pmd_t pmd) { return pmd; } static inline pte_t pte_swp_mksoft_dirty(pte_t pte) { return pte; } static inline int pte_swp_soft_dirty(pte_t pte) { return 0; } static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) { return pte; } static inline pmd_t pmd_swp_mksoft_dirty(pmd_t pmd) { return pmd; } static inline int pmd_swp_soft_dirty(pmd_t pmd) { return 0; } static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd) { return pmd; } #endif #ifndef __HAVE_PFNMAP_TRACKING /* * Interfaces that can be used by architecture code to keep track of * memory type of pfn mappings specified by the remap_pfn_range, * vmf_insert_pfn. */ /* * track_pfn_remap is called when a _new_ pfn mapping is being established * by remap_pfn_range() for physical range indicated by pfn and size. */ static inline int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, unsigned long pfn, unsigned long addr, unsigned long size) { return 0; } /* * track_pfn_insert is called when a _new_ single pfn is established * by vmf_insert_pfn(). */ static inline void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, pfn_t pfn) { } /* * track_pfn_copy is called when vma that is covering the pfnmap gets * copied through copy_page_range(). */ static inline int track_pfn_copy(struct vm_area_struct *vma) { return 0; } /* * untrack_pfn is called while unmapping a pfnmap for a region. * untrack can be called for a specific region indicated by pfn and size or * can be for the entire vma (in which case pfn, size are zero). */ static inline void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, unsigned long size) { } /* * untrack_pfn_moved is called while mremapping a pfnmap for a new region. 
*/ static inline void untrack_pfn_moved(struct vm_area_struct *vma) { } #else extern int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, unsigned long pfn, unsigned long addr, unsigned long size); extern void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, pfn_t pfn); extern int track_pfn_copy(struct vm_area_struct *vma); extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, unsigned long size); extern void untrack_pfn_moved(struct vm_area_struct *vma); #endif #ifdef CONFIG_MMU #ifdef __HAVE_COLOR_ZERO_PAGE static inline int is_zero_pfn(unsigned long pfn) { extern unsigned long zero_pfn; unsigned long offset_from_zero_pfn = pfn - zero_pfn; return offset_from_zero_pfn <= (zero_page_mask >> PAGE_SHIFT); } #define my_zero_pfn(addr) page_to_pfn(ZERO_PAGE(addr)) #else static inline int is_zero_pfn(unsigned long pfn) { extern unsigned long zero_pfn; return pfn == zero_pfn; } static inline unsigned long my_zero_pfn(unsigned long addr) { extern unsigned long zero_pfn; return zero_pfn; } #endif #else static inline int is_zero_pfn(unsigned long pfn) { return 0; } static inline unsigned long my_zero_pfn(unsigned long addr) { return 0; } #endif /* CONFIG_MMU */ #ifdef CONFIG_MMU #ifndef CONFIG_TRANSPARENT_HUGEPAGE static inline int pmd_trans_huge(pmd_t pmd) { return 0; } #ifndef pmd_write static inline int pmd_write(pmd_t pmd) { BUG(); return 0; } #endif /* pmd_write */ #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #ifndef pud_write static inline int pud_write(pud_t pud) { BUG(); return 0; } #endif /* pud_write */ #if !defined(CONFIG_ARCH_HAS_PTE_DEVMAP) || !defined(CONFIG_TRANSPARENT_HUGEPAGE) static inline int pmd_devmap(pmd_t pmd) { return 0; } static inline int pud_devmap(pud_t pud) { return 0; } static inline int pgd_devmap(pgd_t pgd) { return 0; } #endif #if !defined(CONFIG_TRANSPARENT_HUGEPAGE) || \ (defined(CONFIG_TRANSPARENT_HUGEPAGE) && \ !defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)) static inline int pud_trans_huge(pud_t pud) { return 0; } #endif /* See pmd_none_or_trans_huge_or_clear_bad for discussion. */ static inline int pud_none_or_trans_huge_or_dev_or_clear_bad(pud_t *pud) { pud_t pudval = READ_ONCE(*pud); if (pud_none(pudval) || pud_trans_huge(pudval) || pud_devmap(pudval)) return 1; if (unlikely(pud_bad(pudval))) { pud_clear_bad(pud); return 1; } return 0; } /* See pmd_trans_unstable for discussion. */ static inline int pud_trans_unstable(pud_t *pud) { #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \ defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) return pud_none_or_trans_huge_or_dev_or_clear_bad(pud); #else return 0; #endif } #ifndef pmd_read_atomic static inline pmd_t pmd_read_atomic(pmd_t *pmdp) { /* * Depend on compiler for an atomic pmd read. NOTE: this is * only going to work, if the pmdval_t isn't larger than * an unsigned long. */ return *pmdp; } #endif #ifndef arch_needs_pgtable_deposit #define arch_needs_pgtable_deposit() (false) #endif /* * This function is meant to be used by sites walking pagetables with * the mmap_lock held in read mode to protect against MADV_DONTNEED and * transhuge page faults. MADV_DONTNEED can convert a transhuge pmd * into a null pmd and the transhuge page fault can convert a null pmd * into an hugepmd or into a regular pmd (if the hugepage allocation * fails). While holding the mmap_lock in read mode the pmd becomes * stable and stops changing under us only if it's not null and not a * transhuge pmd. 
When those races occurs and this function makes a * difference vs the standard pmd_none_or_clear_bad, the result is * undefined so behaving like if the pmd was none is safe (because it * can return none anyway). The compiler level barrier() is critically * important to compute the two checks atomically on the same pmdval. * * For 32bit kernels with a 64bit large pmd_t this automatically takes * care of reading the pmd atomically to avoid SMP race conditions * against pmd_populate() when the mmap_lock is hold for reading by the * caller (a special atomic read not done by "gcc" as in the generic * version above, is also needed when THP is disabled because the page * fault can populate the pmd from under us). */ static inline int pmd_none_or_trans_huge_or_clear_bad(pmd_t *pmd) { pmd_t pmdval = pmd_read_atomic(pmd); /* * The barrier will stabilize the pmdval in a register or on * the stack so that it will stop changing under the code. * * When CONFIG_TRANSPARENT_HUGEPAGE=y on x86 32bit PAE, * pmd_read_atomic is allowed to return a not atomic pmdval * (for example pointing to an hugepage that has never been * mapped in the pmd). The below checks will only care about * the low part of the pmd with 32bit PAE x86 anyway, with the * exception of pmd_none(). So the important thing is that if * the low part of the pmd is found null, the high part will * be also null or the pmd_none() check below would be * confused. */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE barrier(); #endif /* * !pmd_present() checks for pmd migration entries * * The complete check uses is_pmd_migration_entry() in linux/swapops.h * But using that requires moving current function and pmd_trans_unstable() * to linux/swapops.h to resolve dependency, which is too much code move. * * !pmd_present() is equivalent to is_pmd_migration_entry() currently, * because !pmd_present() pages can only be under migration not swapped * out. * * pmd_none() is preserved for future condition checks on pmd migration * entries and not confusing with this function name, although it is * redundant with !pmd_present(). */ if (pmd_none(pmdval) || pmd_trans_huge(pmdval) || (IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION) && !pmd_present(pmdval))) return 1; if (unlikely(pmd_bad(pmdval))) { pmd_clear_bad(pmd); return 1; } return 0; } /* * This is a noop if Transparent Hugepage Support is not built into * the kernel. Otherwise it is equivalent to * pmd_none_or_trans_huge_or_clear_bad(), and shall only be called in * places that already verified the pmd is not none and they want to * walk ptes while holding the mmap sem in read mode (write mode don't * need this). If THP is not enabled, the pmd can't go away under the * code even if MADV_DONTNEED runs, but if THP is enabled we need to * run a pmd_trans_unstable before walking the ptes after * split_huge_pmd returns (because it may have run when the pmd become * null, but then a page fault can map in a THP and not a regular page). */ static inline int pmd_trans_unstable(pmd_t *pmd) { #ifdef CONFIG_TRANSPARENT_HUGEPAGE return pmd_none_or_trans_huge_or_clear_bad(pmd); #else return 0; #endif } /* * the ordering of these checks is important for pmds with _page_devmap set. * if we check pmd_trans_unstable() first we will trip the bad_pmd() check * inside of pmd_none_or_trans_huge_or_clear_bad(). this will end up correctly * returning 1 but not before it spams dmesg with the pmd_clear_bad() output. 
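 *
 * illustrative caller-side sketch only (the locals and the surrounding
 * walker are hypothetical); the usual pattern before mapping the pte page:
 *
 *	if (pmd_devmap_trans_unstable(pmd))
 *		return 0;	// none/huge/devmap pmd: skip or retry at pmd level
 *	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
 *	... walk the ptes ...
 *	pte_unmap_unlock(pte, ptl);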
*/ static inline int pmd_devmap_trans_unstable(pmd_t *pmd) { return pmd_devmap(*pmd) || pmd_trans_unstable(pmd); } #ifndef CONFIG_NUMA_BALANCING /* * Technically a PTE can be PROTNONE even when not doing NUMA balancing but * the only case the kernel cares is for NUMA balancing and is only ever set * when the VMA is accessible. For PROT_NONE VMAs, the PTEs are not marked * _PAGE_PROTNONE so by default, implement the helper as "always no". It * is the responsibility of the caller to distinguish between PROT_NONE * protections and NUMA hinting fault protections. */ static inline int pte_protnone(pte_t pte) { return 0; } static inline int pmd_protnone(pmd_t pmd) { return 0; } #endif /* CONFIG_NUMA_BALANCING */ #endif /* CONFIG_MMU */ #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP #ifndef __PAGETABLE_P4D_FOLDED int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot); int p4d_clear_huge(p4d_t *p4d); #else static inline int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot) { return 0; } static inline int p4d_clear_huge(p4d_t *p4d) { return 0; } #endif /* !__PAGETABLE_P4D_FOLDED */ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot); int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot); int pud_clear_huge(pud_t *pud); int pmd_clear_huge(pmd_t *pmd); int p4d_free_pud_page(p4d_t *p4d, unsigned long addr); int pud_free_pmd_page(pud_t *pud, unsigned long addr); int pmd_free_pte_page(pmd_t *pmd, unsigned long addr); #else /* !CONFIG_HAVE_ARCH_HUGE_VMAP */ static inline int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot) { return 0; } static inline int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot) { return 0; } static inline int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot) { return 0; } static inline int p4d_clear_huge(p4d_t *p4d) { return 0; } static inline int pud_clear_huge(pud_t *pud) { return 0; } static inline int pmd_clear_huge(pmd_t *pmd) { return 0; } static inline int p4d_free_pud_page(p4d_t *p4d, unsigned long addr) { return 0; } static inline int pud_free_pmd_page(pud_t *pud, unsigned long addr) { return 0; } static inline int pmd_free_pte_page(pmd_t *pmd, unsigned long addr) { return 0; } #endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */ #ifndef __HAVE_ARCH_FLUSH_PMD_TLB_RANGE #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* * ARCHes with special requirements for evicting THP backing TLB entries can * implement this. Otherwise also, it can help optimize normal TLB flush in * THP regime. Stock flush_tlb_range() typically has optimization to nuke the * entire TLB if flush span is greater than a threshold, which will * likely be true for a single huge page. Thus a single THP flush will * invalidate the entire TLB which is not desirable. * e.g. 
see arch/arc: flush_pmd_tlb_range */ #define flush_pmd_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end) #define flush_pud_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end) #else #define flush_pmd_tlb_range(vma, addr, end) BUILD_BUG() #define flush_pud_tlb_range(vma, addr, end) BUILD_BUG() #endif #endif struct file; int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, unsigned long size, pgprot_t *vma_prot); #ifndef CONFIG_X86_ESPFIX64 static inline void init_espfix_bsp(void) { } #endif extern void __init pgtable_cache_init(void); #ifndef __HAVE_ARCH_PFN_MODIFY_ALLOWED static inline bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot) { return true; } static inline bool arch_has_pfn_modify_check(void) { return false; } #endif /* !_HAVE_ARCH_PFN_MODIFY_ALLOWED */ /* * Architecture PAGE_KERNEL_* fallbacks * * Some architectures don't define certain PAGE_KERNEL_* flags. This is either * because they really don't support them, or the port needs to be updated to * reflect the required functionality. Below are a set of relatively safe * fallbacks, as best effort, which we can count on in lieu of the architectures * not defining them on their own yet. */ #ifndef PAGE_KERNEL_RO # define PAGE_KERNEL_RO PAGE_KERNEL #endif #ifndef PAGE_KERNEL_EXEC # define PAGE_KERNEL_EXEC PAGE_KERNEL #endif /* * Page Table Modification bits for pgtbl_mod_mask. * * These are used by the p?d_alloc_track*() set of functions an in the generic * vmalloc/ioremap code to track at which page-table levels entries have been * modified. Based on that the code can better decide when vmalloc and ioremap * mapping changes need to be synchronized to other page-tables in the system. */ #define __PGTBL_PGD_MODIFIED 0 #define __PGTBL_P4D_MODIFIED 1 #define __PGTBL_PUD_MODIFIED 2 #define __PGTBL_PMD_MODIFIED 3 #define __PGTBL_PTE_MODIFIED 4 #define PGTBL_PGD_MODIFIED BIT(__PGTBL_PGD_MODIFIED) #define PGTBL_P4D_MODIFIED BIT(__PGTBL_P4D_MODIFIED) #define PGTBL_PUD_MODIFIED BIT(__PGTBL_PUD_MODIFIED) #define PGTBL_PMD_MODIFIED BIT(__PGTBL_PMD_MODIFIED) #define PGTBL_PTE_MODIFIED BIT(__PGTBL_PTE_MODIFIED) /* Page-Table Modification Mask */ typedef unsigned int pgtbl_mod_mask; #endif /* !__ASSEMBLY__ */ #if !defined(MAX_POSSIBLE_PHYSMEM_BITS) && !defined(CONFIG_64BIT) #ifdef CONFIG_PHYS_ADDR_T_64BIT /* * ZSMALLOC needs to know the highest PFN on 32-bit architectures * with physical address space extension, but falls back to * BITS_PER_LONG otherwise. */ #error Missing MAX_POSSIBLE_PHYSMEM_BITS definition #else #define MAX_POSSIBLE_PHYSMEM_BITS 32 #endif #endif #ifndef has_transparent_hugepage #ifdef CONFIG_TRANSPARENT_HUGEPAGE #define has_transparent_hugepage() 1 #else #define has_transparent_hugepage() 0 #endif #endif /* * On some architectures it depends on the mm if the p4d/pud or pmd * layer of the page table hierarchy is folded or not. 
*/ #ifndef mm_p4d_folded #define mm_p4d_folded(mm) __is_defined(__PAGETABLE_P4D_FOLDED) #endif #ifndef mm_pud_folded #define mm_pud_folded(mm) __is_defined(__PAGETABLE_PUD_FOLDED) #endif #ifndef mm_pmd_folded #define mm_pmd_folded(mm) __is_defined(__PAGETABLE_PMD_FOLDED) #endif #ifndef p4d_offset_lockless #define p4d_offset_lockless(pgdp, pgd, address) p4d_offset(&(pgd), address) #endif #ifndef pud_offset_lockless #define pud_offset_lockless(p4dp, p4d, address) pud_offset(&(p4d), address) #endif #ifndef pmd_offset_lockless #define pmd_offset_lockless(pudp, pud, address) pmd_offset(&(pud), address) #endif /* * p?d_leaf() - true if this entry is a final mapping to a physical address. * This differs from p?d_huge() by the fact that they are always available (if * the architecture supports large pages at the appropriate level) even * if CONFIG_HUGETLB_PAGE is not defined. * Only meaningful when called on a valid entry. */ #ifndef pgd_leaf #define pgd_leaf(x) 0 #endif #ifndef p4d_leaf #define p4d_leaf(x) 0 #endif #ifndef pud_leaf #define pud_leaf(x) 0 #endif #ifndef pmd_leaf #define pmd_leaf(x) 0 #endif #ifndef pgd_leaf_size #define pgd_leaf_size(x) (1ULL << PGDIR_SHIFT) #endif #ifndef p4d_leaf_size #define p4d_leaf_size(x) P4D_SIZE #endif #ifndef pud_leaf_size #define pud_leaf_size(x) PUD_SIZE #endif #ifndef pmd_leaf_size #define pmd_leaf_size(x) PMD_SIZE #endif #ifndef pte_leaf_size #define pte_leaf_size(x) PAGE_SIZE #endif /* * Some architectures have MMUs that are configurable or selectable at boot * time. These lead to variable PTRS_PER_x. For statically allocated arrays it * helps to have a static maximum value. */ #ifndef MAX_PTRS_PER_PTE #define MAX_PTRS_PER_PTE PTRS_PER_PTE #endif #ifndef MAX_PTRS_PER_PMD #define MAX_PTRS_PER_PMD PTRS_PER_PMD #endif #ifndef MAX_PTRS_PER_PUD #define MAX_PTRS_PER_PUD PTRS_PER_PUD #endif #ifndef MAX_PTRS_PER_P4D #define MAX_PTRS_PER_P4D PTRS_PER_P4D #endif #endif /* _LINUX_PGTABLE_H */
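The following is an illustrative sketch only, not part of either header: it shows how the generic helpers above compose into a full walk of the kernel page tables, in the same spirit as pmd_off_k() and virt_to_kpte(). The function name example_walk_kernel_pte() is hypothetical; the p?d_none_or_clear_bad() checks let the walk tolerate a partially populated mapping.

static inline pte_t *example_walk_kernel_pte(unsigned long addr)
{
	pgd_t *pgd = pgd_offset_k(addr);	/* kernel pgd, i.e. init_mm */
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;

	if (pgd_none_or_clear_bad(pgd))
		return NULL;
	p4d = p4d_offset(pgd, addr);
	if (p4d_none_or_clear_bad(p4d))
		return NULL;
	pud = pud_offset(p4d, addr);
	if (pud_none_or_clear_bad(pud))
		return NULL;
	pmd = pmd_offset(pud, addr);
	if (pmd_none_or_clear_bad(pmd))
		return NULL;
	return pte_offset_kernel(pmd, addr);	/* PTE page is always mapped for kernel tables */
}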
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_MMU_NOTIFIER_H #define _LINUX_MMU_NOTIFIER_H #include <linux/list.h> #include <linux/spinlock.h> #include <linux/mm_types.h> #include <linux/mmap_lock.h> #include <linux/srcu.h> #include <linux/interval_tree.h> struct mmu_notifier_subscriptions; struct mmu_notifier; struct mmu_notifier_range; struct mmu_interval_notifier; /** * enum mmu_notifier_event - reason for the mmu notifier callback * @MMU_NOTIFY_UNMAP: either munmap() that unmap the range or a mremap() that * move the range * * @MMU_NOTIFY_CLEAR: clear page table entry (many reasons for this like * 
madvise() or replacing a page by another one, ...). * * @MMU_NOTIFY_PROTECTION_VMA: update is due to protection change for the range * ie using the vma access permission (vm_page_prot) to update the whole range * is enough no need to inspect changes to the CPU page table (mprotect() * syscall) * * @MMU_NOTIFY_PROTECTION_PAGE: update is due to change in read/write flag for * pages in the range so to mirror those changes the user must inspect the CPU * page table (from the end callback). * * @MMU_NOTIFY_SOFT_DIRTY: soft dirty accounting (still same page and same * access flags). User should soft dirty the page in the end callback to make * sure that anyone relying on soft dirtiness catch pages that might be written * through non CPU mappings. * * @MMU_NOTIFY_RELEASE: used during mmu_interval_notifier invalidate to signal * that the mm refcount is zero and the range is no longer accessible. * * @MMU_NOTIFY_MIGRATE: used during migrate_vma_collect() invalidate to signal * a device driver to possibly ignore the invalidation if the * owner field matches the driver's device private pgmap owner. * * @MMU_NOTIFY_EXCLUSIVE: to signal a device driver that the device will no * longer have exclusive access to the page. When sent during creation of an * exclusive range the owner will be initialised to the value provided by the * caller of make_device_exclusive_range(), otherwise the owner will be NULL. */ enum mmu_notifier_event { MMU_NOTIFY_UNMAP = 0, MMU_NOTIFY_CLEAR, MMU_NOTIFY_PROTECTION_VMA, MMU_NOTIFY_PROTECTION_PAGE, MMU_NOTIFY_SOFT_DIRTY, MMU_NOTIFY_RELEASE, MMU_NOTIFY_MIGRATE, MMU_NOTIFY_EXCLUSIVE, }; #define MMU_NOTIFIER_RANGE_BLOCKABLE (1 << 0) struct mmu_notifier_ops { /* * Called either by mmu_notifier_unregister or when the mm is * being destroyed by exit_mmap, always before all pages are * freed. This can run concurrently with other mmu notifier * methods (the ones invoked outside the mm context) and it * should tear down all secondary mmu mappings and freeze the * secondary mmu. If this method isn't implemented you've to * be sure that nothing could possibly write to the pages * through the secondary mmu by the time the last thread with * tsk->mm == mm exits. * * As side note: the pages freed after ->release returns could * be immediately reallocated by the gart at an alias physical * address with a different cache model, so if ->release isn't * implemented because all _software_ driven memory accesses * through the secondary mmu are terminated by the time the * last thread of this mm quits, you've also to be sure that * speculative _hardware_ operations can't allocate dirty * cachelines in the cpu that could not be snooped and made * coherent with the other read and write operations happening * through the gart alias address, so leading to memory * corruption. */ void (*release)(struct mmu_notifier *subscription, struct mm_struct *mm); /* * clear_flush_young is called after the VM is * test-and-clearing the young/accessed bitflag in the * pte. This way the VM will provide proper aging to the * accesses to the page through the secondary MMUs and not * only to the ones through the Linux pte. * Start-end is necessary in case the secondary MMU is mapping the page * at a smaller granularity than the primary MMU. */ int (*clear_flush_young)(struct mmu_notifier *subscription, struct mm_struct *mm, unsigned long start, unsigned long end); /* * clear_young is a lightweight version of clear_flush_young. 
Like the * latter, it is supposed to test-and-clear the young/accessed bitflag * in the secondary pte, but it may omit flushing the secondary tlb. */ int (*clear_young)(struct mmu_notifier *subscription, struct mm_struct *mm, unsigned long start, unsigned long end); /* * test_young is called to check the young/accessed bitflag in * the secondary pte. This is used to know if the page is * frequently used without actually clearing the flag or tearing * down the secondary mapping on the page. */ int (*test_young)(struct mmu_notifier *subscription, struct mm_struct *mm, unsigned long address); /* * change_pte is called in cases that pte mapping to page is changed: * for example, when ksm remaps pte to point to a new shared page. */ void (*change_pte)(struct mmu_notifier *subscription, struct mm_struct *mm, unsigned long address, pte_t pte); /* * invalidate_range_start() and invalidate_range_end() must be * paired and are called only when the mmap_lock and/or the * locks protecting the reverse maps are held. If the subsystem * can't guarantee that no additional references are taken to * the pages in the range, it has to implement the * invalidate_range() notifier to remove any references taken * after invalidate_range_start(). * * Invalidation of multiple concurrent ranges may be * optionally permitted by the driver. Either way the * establishment of sptes is forbidden in the range passed to * invalidate_range_begin/end for the whole duration of the * invalidate_range_begin/end critical section. * * invalidate_range_start() is called when all pages in the * range are still mapped and have at least a refcount of one. * * invalidate_range_end() is called when all pages in the * range have been unmapped and the pages have been freed by * the VM. * * The VM will remove the page table entries and potentially * the page between invalidate_range_start() and * invalidate_range_end(). If the page must not be freed * because of pending I/O or other circumstances then the * invalidate_range_start() callback (or the initial mapping * by the driver) must make sure that the refcount is kept * elevated. * * If the driver increases the refcount when the pages are * initially mapped into an address space then either * invalidate_range_start() or invalidate_range_end() may * decrease the refcount. If the refcount is decreased on * invalidate_range_start() then the VM can free pages as page * table entries are removed. If the refcount is only * dropped on invalidate_range_end() then the driver itself * will drop the last refcount but it must take care to flush * any secondary tlb before doing the final free on the * page. Pages will no longer be referenced by the linux * address space but may still be referenced by sptes until * the last refcount is dropped. * * If blockable argument is set to false then the callback cannot * sleep and has to return with -EAGAIN if sleeping would be required. * 0 should be returned otherwise. Please note that notifiers that can * fail invalidate_range_start are not allowed to implement * invalidate_range_end, as there is no mechanism for informing the * notifier that its start failed. 
*/ int (*invalidate_range_start)(struct mmu_notifier *subscription, const struct mmu_notifier_range *range); void (*invalidate_range_end)(struct mmu_notifier *subscription, const struct mmu_notifier_range *range); /* * invalidate_range() is either called between * invalidate_range_start() and invalidate_range_end() when the * VM has to free pages that where unmapped, but before the * pages are actually freed, or outside of _start()/_end() when * a (remote) TLB is necessary. * * If invalidate_range() is used to manage a non-CPU TLB with * shared page-tables, it not necessary to implement the * invalidate_range_start()/end() notifiers, as * invalidate_range() already catches the points in time when an * external TLB range needs to be flushed. For more in depth * discussion on this see Documentation/vm/mmu_notifier.rst * * Note that this function might be called with just a sub-range * of what was passed to invalidate_range_start()/end(), if * called between those functions. */ void (*invalidate_range)(struct mmu_notifier *subscription, struct mm_struct *mm, unsigned long start, unsigned long end); /* * These callbacks are used with the get/put interface to manage the * lifetime of the mmu_notifier memory. alloc_notifier() returns a new * notifier for use with the mm. * * free_notifier() is only called after the mmu_notifier has been * fully put, calls to any ops callback are prevented and no ops * callbacks are currently running. It is called from a SRCU callback * and cannot sleep. */ struct mmu_notifier *(*alloc_notifier)(struct mm_struct *mm); void (*free_notifier)(struct mmu_notifier *subscription); }; /* * The notifier chains are protected by mmap_lock and/or the reverse map * semaphores. Notifier chains are only changed when all reverse maps and * the mmap_lock locks are taken. * * Therefore notifier chains can only be traversed when either * * 1. mmap_lock is held. * 2. One of the reverse map locks is held (i_mmap_rwsem or anon_vma->rwsem). * 3. No other concurrent thread can access the list (release) */ struct mmu_notifier { struct hlist_node hlist; const struct mmu_notifier_ops *ops; struct mm_struct *mm; struct rcu_head rcu; unsigned int users; }; /** * struct mmu_interval_notifier_ops * @invalidate: Upon return the caller must stop using any SPTEs within this * range. This function can sleep. Return false only if sleeping * was required but mmu_notifier_range_blockable(range) is false. 
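 *
 * Illustrative sketch only ("my_invalidate" and "driver_lock" are
 * hypothetical, not part of this header):
 *
 *	static bool my_invalidate(struct mmu_interval_notifier *sub,
 *				  const struct mmu_notifier_range *range,
 *				  unsigned long cur_seq)
 *	{
 *		if (mmu_notifier_range_blockable(range))
 *			mutex_lock(&driver_lock);
 *		else if (!mutex_trylock(&driver_lock))
 *			return false;
 *		mmu_interval_set_seq(sub, cur_seq);
 *		// tear down device mappings in [range->start, range->end)
 *		mutex_unlock(&driver_lock);
 *		return true;
 *	}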
*/ struct mmu_interval_notifier_ops { bool (*invalidate)(struct mmu_interval_notifier *interval_sub, const struct mmu_notifier_range *range, unsigned long cur_seq); }; struct mmu_interval_notifier { struct interval_tree_node interval_tree; const struct mmu_interval_notifier_ops *ops; struct mm_struct *mm; struct hlist_node deferred_item; unsigned long invalidate_seq; }; #ifdef CONFIG_MMU_NOTIFIER #ifdef CONFIG_LOCKDEP extern struct lockdep_map __mmu_notifier_invalidate_range_start_map; #endif struct mmu_notifier_range { struct vm_area_struct *vma; struct mm_struct *mm; unsigned long start; unsigned long end; unsigned flags; enum mmu_notifier_event event; void *owner; }; static inline int mm_has_notifiers(struct mm_struct *mm) { return unlikely(mm->notifier_subscriptions); } struct mmu_notifier *mmu_notifier_get_locked(const struct mmu_notifier_ops *ops, struct mm_struct *mm); static inline struct mmu_notifier * mmu_notifier_get(const struct mmu_notifier_ops *ops, struct mm_struct *mm) { struct mmu_notifier *ret; mmap_write_lock(mm); ret = mmu_notifier_get_locked(ops, mm); mmap_write_unlock(mm); return ret; } void mmu_notifier_put(struct mmu_notifier *subscription); void mmu_notifier_synchronize(void); extern int mmu_notifier_register(struct mmu_notifier *subscription, struct mm_struct *mm); extern int __mmu_notifier_register(struct mmu_notifier *subscription, struct mm_struct *mm); extern void mmu_notifier_unregister(struct mmu_notifier *subscription, struct mm_struct *mm); unsigned long mmu_interval_read_begin(struct mmu_interval_notifier *interval_sub); int mmu_interval_notifier_insert(struct mmu_interval_notifier *interval_sub, struct mm_struct *mm, unsigned long start, unsigned long length, const struct mmu_interval_notifier_ops *ops); int mmu_interval_notifier_insert_locked( struct mmu_interval_notifier *interval_sub, struct mm_struct *mm, unsigned long start, unsigned long length, const struct mmu_interval_notifier_ops *ops); void mmu_interval_notifier_remove(struct mmu_interval_notifier *interval_sub); /** * mmu_interval_set_seq - Save the invalidation sequence * @interval_sub - The subscription passed to invalidate * @cur_seq - The cur_seq passed to the invalidate() callback * * This must be called unconditionally from the invalidate callback of a * struct mmu_interval_notifier_ops under the same lock that is used to call * mmu_interval_read_retry(). It updates the sequence number for later use by * mmu_interval_read_retry(). The provided cur_seq will always be odd. * * If the caller does not call mmu_interval_read_begin() or * mmu_interval_read_retry() then this call is not required. */ static inline void mmu_interval_set_seq(struct mmu_interval_notifier *interval_sub, unsigned long cur_seq) { WRITE_ONCE(interval_sub->invalidate_seq, cur_seq); } /** * mmu_interval_read_retry - End a read side critical section against a VA range * interval_sub: The subscription * seq: The return of the paired mmu_interval_read_begin() * * This MUST be called under a user provided lock that is also held * unconditionally by op->invalidate() when it calls mmu_interval_set_seq(). * * Each call should be paired with a single mmu_interval_read_begin() and * should be used to conclude the read side. * * Returns true if an invalidation collided with this critical section, and * the caller should retry. 
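 *
 * Illustrative read-side sketch only ("driver_lock" and
 * make_device_ptes() are hypothetical, not part of this header):
 *
 *	again:
 *		seq = mmu_interval_read_begin(sub);
 *		make_device_ptes();		// snapshot the CPU page tables
 *		mutex_lock(&driver_lock);	// same lock taken by ->invalidate()
 *		if (mmu_interval_read_retry(sub, seq)) {
 *			mutex_unlock(&driver_lock);
 *			goto again;
 *		}
 *		// publish the device page tables
 *		mutex_unlock(&driver_lock);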
*/ static inline bool mmu_interval_read_retry(struct mmu_interval_notifier *interval_sub, unsigned long seq) { return interval_sub->invalidate_seq != seq; } /** * mmu_interval_check_retry - Test if a collision has occurred * interval_sub: The subscription * seq: The return of the matching mmu_interval_read_begin() * * This can be used in the critical section between mmu_interval_read_begin() * and mmu_interval_read_retry(). A return of true indicates an invalidation * has collided with this critical region and a future * mmu_interval_read_retry() will return true. * * False is not reliable and only suggests a collision may not have * occurred. It can be called many times and does not have to hold the user * provided lock. * * This call can be used as part of loops and other expensive operations to * expedite a retry. */ static inline bool mmu_interval_check_retry(struct mmu_interval_notifier *interval_sub, unsigned long seq) { /* Pairs with the WRITE_ONCE in mmu_interval_set_seq() */ return READ_ONCE(interval_sub->invalidate_seq) != seq; } extern void __mmu_notifier_subscriptions_destroy(struct mm_struct *mm); extern void __mmu_notifier_release(struct mm_struct *mm); extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm, unsigned long start, unsigned long end); extern int __mmu_notifier_clear_young(struct mm_struct *mm, unsigned long start, unsigned long end); extern int __mmu_notifier_test_young(struct mm_struct *mm, unsigned long address); extern void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, pte_t pte); extern int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *r); extern void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *r, bool only_end); extern void __mmu_notifier_invalidate_range(struct mm_struct *mm, unsigned long start, unsigned long end); extern bool mmu_notifier_range_update_to_read_only(const struct mmu_notifier_range *range); static inline bool mmu_notifier_range_blockable(const struct mmu_notifier_range *range) { return (range->flags & MMU_NOTIFIER_RANGE_BLOCKABLE); } static inline void mmu_notifier_release(struct mm_struct *mm) { if (mm_has_notifiers(mm)) __mmu_notifier_release(mm); } static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm, unsigned long start, unsigned long end) { if (mm_has_notifiers(mm)) return __mmu_notifier_clear_flush_young(mm, start, end); return 0; } static inline int mmu_notifier_clear_young(struct mm_struct *mm, unsigned long start, unsigned long end) { if (mm_has_notifiers(mm)) return __mmu_notifier_clear_young(mm, start, end); return 0; } static inline int mmu_notifier_test_young(struct mm_struct *mm, unsigned long address) { if (mm_has_notifiers(mm)) return __mmu_notifier_test_young(mm, address); return 0; } static inline void mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, pte_t pte) { if (mm_has_notifiers(mm)) __mmu_notifier_change_pte(mm, address, pte); } static inline void mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range) { might_sleep(); lock_map_acquire(&__mmu_notifier_invalidate_range_start_map); if (mm_has_notifiers(range->mm)) { range->flags |= MMU_NOTIFIER_RANGE_BLOCKABLE; __mmu_notifier_invalidate_range_start(range); } lock_map_release(&__mmu_notifier_invalidate_range_start_map); } static inline int mmu_notifier_invalidate_range_start_nonblock(struct mmu_notifier_range *range) { int ret = 0; lock_map_acquire(&__mmu_notifier_invalidate_range_start_map); if (mm_has_notifiers(range->mm)) { range->flags &= 
~MMU_NOTIFIER_RANGE_BLOCKABLE; ret = __mmu_notifier_invalidate_range_start(range); } lock_map_release(&__mmu_notifier_invalidate_range_start_map); return ret; } static inline void mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range) { if (mmu_notifier_range_blockable(range)) might_sleep(); if (mm_has_notifiers(range->mm)) __mmu_notifier_invalidate_range_end(range, false); } static inline void mmu_notifier_invalidate_range_only_end(struct mmu_notifier_range *range) { if (mm_has_notifiers(range->mm)) __mmu_notifier_invalidate_range_end(range, true); } static inline void mmu_notifier_invalidate_range(struct mm_struct *mm, unsigned long start, unsigned long end) { if (mm_has_notifiers(mm)) __mmu_notifier_invalidate_range(mm, start, end); } static inline void mmu_notifier_subscriptions_init(struct mm_struct *mm) { mm->notifier_subscriptions = NULL; } static inline void mmu_notifier_subscriptions_destroy(struct mm_struct *mm) { if (mm_has_notifiers(mm)) __mmu_notifier_subscriptions_destroy(mm); } static inline void mmu_notifier_range_init(struct mmu_notifier_range *range, enum mmu_notifier_event event, unsigned flags, struct vm_area_struct *vma, struct mm_struct *mm, unsigned long start, unsigned long end) { range->vma = vma; range->event = event; range->mm = mm; range->start = start; range->end = end; range->flags = flags; } static inline void mmu_notifier_range_init_owner( struct mmu_notifier_range *range, enum mmu_notifier_event event, unsigned int flags, struct vm_area_struct *vma, struct mm_struct *mm, unsigned long start, unsigned long end, void *owner) { mmu_notifier_range_init(range, event, flags, vma, mm, start, end); range->owner = owner; } #define ptep_clear_flush_young_notify(__vma, __address, __ptep) \ ({ \ int __young; \ struct vm_area_struct *___vma = __vma; \ unsigned long ___address = __address; \ __young = ptep_clear_flush_young(___vma, ___address, __ptep); \ __young |= mmu_notifier_clear_flush_young(___vma->vm_mm, \ ___address, \ ___address + \ PAGE_SIZE); \ __young; \ }) #define pmdp_clear_flush_young_notify(__vma, __address, __pmdp) \ ({ \ int __young; \ struct vm_area_struct *___vma = __vma; \ unsigned long ___address = __address; \ __young = pmdp_clear_flush_young(___vma, ___address, __pmdp); \ __young |= mmu_notifier_clear_flush_young(___vma->vm_mm, \ ___address, \ ___address + \ PMD_SIZE); \ __young; \ }) #define ptep_clear_young_notify(__vma, __address, __ptep) \ ({ \ int __young; \ struct vm_area_struct *___vma = __vma; \ unsigned long ___address = __address; \ __young = ptep_test_and_clear_young(___vma, ___address, __ptep);\ __young |= mmu_notifier_clear_young(___vma->vm_mm, ___address, \ ___address + PAGE_SIZE); \ __young; \ }) #define pmdp_clear_young_notify(__vma, __address, __pmdp) \ ({ \ int __young; \ struct vm_area_struct *___vma = __vma; \ unsigned long ___address = __address; \ __young = pmdp_test_and_clear_young(___vma, ___address, __pmdp);\ __young |= mmu_notifier_clear_young(___vma->vm_mm, ___address, \ ___address + PMD_SIZE); \ __young; \ }) #define ptep_clear_flush_notify(__vma, __address, __ptep) \ ({ \ unsigned long ___addr = __address & PAGE_MASK; \ struct mm_struct *___mm = (__vma)->vm_mm; \ pte_t ___pte; \ \ ___pte = ptep_clear_flush(__vma, __address, __ptep); \ mmu_notifier_invalidate_range(___mm, ___addr, \ ___addr + PAGE_SIZE); \ \ ___pte; \ }) #define pmdp_huge_clear_flush_notify(__vma, __haddr, __pmd) \ ({ \ unsigned long ___haddr = __haddr & HPAGE_PMD_MASK; \ struct mm_struct *___mm = (__vma)->vm_mm; \ pmd_t ___pmd; \ \ ___pmd = 
pmdp_huge_clear_flush(__vma, __haddr, __pmd); \ mmu_notifier_invalidate_range(___mm, ___haddr, \ ___haddr + HPAGE_PMD_SIZE); \ \ ___pmd; \ }) #define pudp_huge_clear_flush_notify(__vma, __haddr, __pud) \ ({ \ unsigned long ___haddr = __haddr & HPAGE_PUD_MASK; \ struct mm_struct *___mm = (__vma)->vm_mm; \ pud_t ___pud; \ \ ___pud = pudp_huge_clear_flush(__vma, __haddr, __pud); \ mmu_notifier_invalidate_range(___mm, ___haddr, \ ___haddr + HPAGE_PUD_SIZE); \ \ ___pud; \ }) /* * set_pte_at_notify() sets the pte _after_ running the notifier. * This is safe to start by updating the secondary MMUs, because the primary MMU * pte invalidate must have already happened with a ptep_clear_flush() before * set_pte_at_notify() has been invoked. Updating the secondary MMUs first is * required when we change both the protection of the mapping from read-only to * read-write and the pfn (like during copy on write page faults). Otherwise the * old page would remain mapped readonly in the secondary MMUs after the new * page is already writable by some CPU through the primary MMU. */ #define set_pte_at_notify(__mm, __address, __ptep, __pte) \ ({ \ struct mm_struct *___mm = __mm; \ unsigned long ___address = __address; \ pte_t ___pte = __pte; \ \ mmu_notifier_change_pte(___mm, ___address, ___pte); \ set_pte_at(___mm, ___address, __ptep, ___pte); \ }) #else /* CONFIG_MMU_NOTIFIER */ struct mmu_notifier_range { unsigned long start; unsigned long end; }; static inline void _mmu_notifier_range_init(struct mmu_notifier_range *range, unsigned long start, unsigned long end) { range->start = start; range->end = end; } #define mmu_notifier_range_init(range,event,flags,vma,mm,start,end) \ _mmu_notifier_range_init(range, start, end) #define mmu_notifier_range_init_owner(range, event, flags, vma, mm, start, \ end, owner) \ _mmu_notifier_range_init(range, start, end) static inline bool mmu_notifier_range_blockable(const struct mmu_notifier_range *range) { return true; } static inline int mm_has_notifiers(struct mm_struct *mm) { return 0; } static inline void mmu_notifier_release(struct mm_struct *mm) { } static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm, unsigned long start, unsigned long end) { return 0; } static inline int mmu_notifier_test_young(struct mm_struct *mm, unsigned long address) { return 0; } static inline void mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, pte_t pte) { } static inline void mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range) { } static inline int mmu_notifier_invalidate_range_start_nonblock(struct mmu_notifier_range *range) { return 0; } static inline void mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range) { } static inline void mmu_notifier_invalidate_range_only_end(struct mmu_notifier_range *range) { } static inline void mmu_notifier_invalidate_range(struct mm_struct *mm, unsigned long start, unsigned long end) { } static inline void mmu_notifier_subscriptions_init(struct mm_struct *mm) { } static inline void mmu_notifier_subscriptions_destroy(struct mm_struct *mm) { } #define mmu_notifier_range_update_to_read_only(r) false #define ptep_clear_flush_young_notify ptep_clear_flush_young #define pmdp_clear_flush_young_notify pmdp_clear_flush_young #define ptep_clear_young_notify ptep_test_and_clear_young #define pmdp_clear_young_notify pmdp_test_and_clear_young #define ptep_clear_flush_notify ptep_clear_flush #define pmdp_huge_clear_flush_notify pmdp_huge_clear_flush #define pudp_huge_clear_flush_notify 
pudp_huge_clear_flush #define set_pte_at_notify set_pte_at static inline void mmu_notifier_synchronize(void) { } #endif /* CONFIG_MMU_NOTIFIER */ #endif /* _LINUX_MMU_NOTIFIER_H */
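/*
 * Illustrative sketch, not part of the header above: one way a driver might
 * use the get/put lifetime model described for alloc_notifier() and
 * free_notifier().  "struct my_ctx", my_mn_ops, my_ctx_get() and my_ctx_put()
 * are hypothetical names; only the mmu_notifier_* calls and the two ops
 * callbacks come from the documented API.
 */
#include <linux/mmu_notifier.h>
#include <linux/slab.h>
#include <linux/err.h>

struct my_ctx {
	struct mmu_notifier mn;
	/* ... per-mm driver state ... */
};

static struct mmu_notifier *my_alloc_notifier(struct mm_struct *mm)
{
	struct my_ctx *ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);

	return ctx ? &ctx->mn : ERR_PTR(-ENOMEM);
}

/* Runs from an SRCU callback after the final put; must not sleep. */
static void my_free_notifier(struct mmu_notifier *mn)
{
	kfree(container_of(mn, struct my_ctx, mn));
}

static const struct mmu_notifier_ops my_mn_ops = {
	.alloc_notifier	= my_alloc_notifier,
	.free_notifier	= my_free_notifier,
	/* .invalidate_range_start/_end/.invalidate_range as required */
};

/*
 * All users of the same mm and the same ops share one notifier; the core
 * refcounts it, so the open path is a get and the teardown path a put.
 */
static struct my_ctx *my_ctx_get(struct mm_struct *mm)
{
	struct mmu_notifier *mn = mmu_notifier_get(&my_mn_ops, mm);

	if (IS_ERR(mn))
		return ERR_CAST(mn);
	return container_of(mn, struct my_ctx, mn);
}

static void my_ctx_put(struct my_ctx *ctx)
{
	mmu_notifier_put(&ctx->mn);	/* free_notifier() runs later via SRCU */
}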
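/*
 * Illustrative sketch, not taken from the header: the bracketing pattern a
 * page-table walker is expected to follow when clearing PTEs in a VA range.
 * my_zap_range() is a hypothetical stand-in for core-MM code; only the
 * mmu_notifier_range_init()/invalidate_range_start()/_end() calls and the
 * MMU_NOTIFY_CLEAR event come from the API documented above.
 */
#include <linux/mmu_notifier.h>
#include <linux/mm.h>

static void my_zap_range(struct vm_area_struct *vma,
			 unsigned long start, unsigned long end)
{
	struct mmu_notifier_range range;

	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma,
				vma->vm_mm, start, end);

	/* Secondary MMUs must stop using the range before the PTEs change. */
	mmu_notifier_invalidate_range_start(&range);

	/*
	 * ... walk the page tables here and clear PTEs, e.g. with
	 * ptep_clear_flush_notify(), which also emits ->invalidate_range()
	 * for TLBs that are kept in sync without sleeping ...
	 */

	mmu_notifier_invalidate_range_end(&range);
}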
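/*
 * Illustrative sketch, not part of the header: the sequence/retry protocol
 * documented for mmu_interval_read_begin()/_retry(), as a device page-table
 * mirror might use it.  "struct my_mirror", its mutex and the commented-out
 * wipe/program helpers are hypothetical; the mmu_interval_* and
 * mmu_notifier_range_blockable() calls are the documented API.
 */
#include <linux/mmu_notifier.h>
#include <linux/mutex.h>

struct my_mirror {
	struct mmu_interval_notifier notifier;
	struct mutex lock;	/* serializes SPTE updates against invalidation */
	/* ... device page-table state ... */
};

static bool my_mirror_invalidate(struct mmu_interval_notifier *sub,
				 const struct mmu_notifier_range *range,
				 unsigned long cur_seq)
{
	struct my_mirror *m = container_of(sub, struct my_mirror, notifier);

	if (mmu_notifier_range_blockable(range))
		mutex_lock(&m->lock);
	else if (!mutex_trylock(&m->lock))
		return false;	/* only legal when the range is non-blockable */

	mmu_interval_set_seq(sub, cur_seq);	/* must happen under m->lock */
	/* wipe_device_sptes(m, range->start, range->end); */
	mutex_unlock(&m->lock);
	return true;
}

static const struct mmu_interval_notifier_ops my_mirror_ops = {
	.invalidate = my_mirror_invalidate,
};

/* Fault path: retry until no invalidation collided with the update. */
static int my_mirror_fault(struct my_mirror *m, unsigned long addr)
{
	unsigned long seq;

	for (;;) {
		seq = mmu_interval_read_begin(&m->notifier);
		/* ... look up/pin the CPU page backing addr ... */
		mutex_lock(&m->lock);
		if (mmu_interval_read_retry(&m->notifier, seq)) {
			mutex_unlock(&m->lock);
			continue;	/* raced with an invalidation, redo */
		}
		/* program_device_sptes(m, addr); */
		mutex_unlock(&m->lock);
		return 0;
	}
}

/*
 * The notifier itself would be registered over the VA range of interest with
 * mmu_interval_notifier_insert(&m->notifier, mm, start, length, &my_mirror_ops).
 */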
/* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM context_tracking #if !defined(_TRACE_CONTEXT_TRACKING_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_CONTEXT_TRACKING_H #include <linux/tracepoint.h> DECLARE_EVENT_CLASS(context_tracking_user, TP_PROTO(int dummy), TP_ARGS(dummy), TP_STRUCT__entry( __field( int, dummy ) ), TP_fast_assign( __entry->dummy = dummy; ), TP_printk("%s", "") ); /** * user_enter - called when the kernel resumes to userspace * @dummy: dummy arg to make trace event macro happy * * This event occurs when the kernel resumes to userspace after * an exception or a syscall. */ DEFINE_EVENT(context_tracking_user, user_enter, TP_PROTO(int dummy), TP_ARGS(dummy) ); /** * user_exit - called when userspace enters the kernel * @dummy: dummy arg to make trace event macro happy * * This event occurs when userspace enters the kernel through * an exception or a syscall. */ DEFINE_EVENT(context_tracking_user, user_exit, TP_PROTO(int dummy), TP_ARGS(dummy) ); #endif /* _TRACE_CONTEXT_TRACKING_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
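/*
 * Illustrative sketch, not part of the trace header above: the usual way
 * such events are instantiated and fired.  Exactly one .c file (typically
 * kernel/context_tracking.c in mainline) defines CREATE_TRACE_POINTS before
 * including the header, which generates trace_user_enter()/trace_user_exit()
 * taking the dummy int.  example_user_transition() is a hypothetical caller.
 */
#define CREATE_TRACE_POINTS
#include <trace/events/context_tracking.h>

static void example_user_transition(bool entering_user)
{
	if (entering_user)
		trace_user_enter(0);	/* kernel about to resume userspace */
	else
		trace_user_exit(0);	/* userspace just entered the kernel */
}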
911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 // SPDX-License-Identifier: GPL-2.0-only /* Cluster IP hashmark target * (C) 2003-2004 by Harald Welte <laforge@netfilter.org> * based on ideas of Fabio Olive Leite <olive@unixforge.org> * * Development of this code funded by SuSE Linux AG, https://www.suse.com/ */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/proc_fs.h> #include <linux/jhash.h> #include <linux/bitops.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/ip.h> #include <linux/tcp.h> #include <linux/udp.h> #include <linux/icmp.h> #include <linux/if_arp.h> #include <linux/seq_file.h> #include <linux/refcount.h> #include <linux/netfilter_arp.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter_ipv4/ip_tables.h> #include <linux/netfilter_ipv4/ipt_CLUSTERIP.h> #include <net/netfilter/nf_conntrack.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/checksum.h> #include <net/ip.h> #define CLUSTERIP_VERSION "0.8" MODULE_LICENSE("GPL"); MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); MODULE_DESCRIPTION("Xtables: CLUSTERIP target"); struct clusterip_config { struct list_head list; /* list of all configs */ refcount_t refcount; /* reference count */ refcount_t entries; /* number of entries/rules * referencing us */ __be32 clusterip; /* the IP address */ u_int8_t clustermac[ETH_ALEN]; /* the MAC address */ int ifindex; /* device ifindex */ u_int16_t num_total_nodes; /* total number of nodes */ unsigned long local_nodes; /* node number array */ #ifdef CONFIG_PROC_FS struct proc_dir_entry *pde; /* proc dir entry */ #endif enum clusterip_hashmode hash_mode; /* which hashing mode */ u_int32_t hash_initval; /* hash initialization */ struct rcu_head rcu; /* for call_rcu */ struct net *net; /* netns for pernet list */ char ifname[IFNAMSIZ]; /* device ifname */ }; #ifdef CONFIG_PROC_FS static const struct proc_ops clusterip_proc_ops; #endif struct clusterip_net { struct list_head configs; /* lock protects the configs list */ spinlock_t lock; bool clusterip_deprecated_warning; #ifdef CONFIG_PROC_FS struct proc_dir_entry *procdir; /* mutex protects the config->pde*/ struct mutex mutex; #endif unsigned int hook_users; }; static unsigned int clusterip_arp_mangle(void *priv, struct sk_buff *skb, const struct nf_hook_state *state); static const struct nf_hook_ops cip_arp_ops = { .hook = clusterip_arp_mangle, .pf = NFPROTO_ARP, .hooknum = NF_ARP_OUT, .priority = -1 }; static unsigned int clusterip_net_id __read_mostly; static inline struct clusterip_net *clusterip_pernet(struct net *net) { return net_generic(net, clusterip_net_id); } static inline void clusterip_config_get(struct clusterip_config *c) { refcount_inc(&c->refcount); } static void clusterip_config_rcu_free(struct rcu_head *head) { struct clusterip_config *config; struct net_device *dev; config = container_of(head, struct clusterip_config, rcu); dev = dev_get_by_name(config->net, config->ifname); if (dev) { dev_mc_del(dev, config->clustermac); dev_put(dev); } kfree(config); } static inline void clusterip_config_put(struct clusterip_config *c) { if (refcount_dec_and_test(&c->refcount)) call_rcu(&c->rcu, clusterip_config_rcu_free); } /* decrease the count of entries using/referencing this config. 
If last * entry(rule) is removed, remove the config from lists, but don't free it * yet, since proc-files could still be holding references */ static inline void clusterip_config_entry_put(struct clusterip_config *c) { struct clusterip_net *cn = clusterip_pernet(c->net); local_bh_disable(); if (refcount_dec_and_lock(&c->entries, &cn->lock)) { list_del_rcu(&c->list); spin_unlock(&cn->lock); local_bh_enable(); /* In case anyone still accesses the file, the open/close * functions are also incrementing the refcount on their own, * so it's safe to remove the entry even if it's in use. */ #ifdef CONFIG_PROC_FS mutex_lock(&cn->mutex); if (cn->procdir) proc_remove(c->pde); mutex_unlock(&cn->mutex); #endif return; } local_bh_enable(); } static struct clusterip_config * __clusterip_config_find(struct net *net, __be32 clusterip) { struct clusterip_config *c; struct clusterip_net *cn = clusterip_pernet(net); list_for_each_entry_rcu(c, &cn->configs, list) { if (c->clusterip == clusterip) return c; } return NULL; } static inline struct clusterip_config * clusterip_config_find_get(struct net *net, __be32 clusterip, int entry) { struct clusterip_config *c; rcu_read_lock_bh(); c = __clusterip_config_find(net, clusterip); if (c) { #ifdef CONFIG_PROC_FS if (!c->pde) c = NULL; else #endif if (unlikely(!refcount_inc_not_zero(&c->refcount))) c = NULL; else if (entry) { if (unlikely(!refcount_inc_not_zero(&c->entries))) { clusterip_config_put(c); c = NULL; } } } rcu_read_unlock_bh(); return c; } static void clusterip_config_init_nodelist(struct clusterip_config *c, const struct ipt_clusterip_tgt_info *i) { int n; for (n = 0; n < i->num_local_nodes; n++) set_bit(i->local_nodes[n] - 1, &c->local_nodes); } static int clusterip_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); struct clusterip_net *cn = clusterip_pernet(net); struct clusterip_config *c; spin_lock_bh(&cn->lock); list_for_each_entry_rcu(c, &cn->configs, list) { switch (event) { case NETDEV_REGISTER: if (!strcmp(dev->name, c->ifname)) { c->ifindex = dev->ifindex; dev_mc_add(dev, c->clustermac); } break; case NETDEV_UNREGISTER: if (dev->ifindex == c->ifindex) { dev_mc_del(dev, c->clustermac); c->ifindex = -1; } break; case NETDEV_CHANGENAME: if (!strcmp(dev->name, c->ifname)) { c->ifindex = dev->ifindex; dev_mc_add(dev, c->clustermac); } else if (dev->ifindex == c->ifindex) { dev_mc_del(dev, c->clustermac); c->ifindex = -1; } break; } } spin_unlock_bh(&cn->lock); return NOTIFY_DONE; } static struct clusterip_config * clusterip_config_init(struct net *net, const struct ipt_clusterip_tgt_info *i, __be32 ip, const char *iniface) { struct clusterip_net *cn = clusterip_pernet(net); struct clusterip_config *c; struct net_device *dev; int err; if (iniface[0] == '\0') { pr_info("Please specify an interface name\n"); return ERR_PTR(-EINVAL); } c = kzalloc(sizeof(*c), GFP_ATOMIC); if (!c) return ERR_PTR(-ENOMEM); dev = dev_get_by_name(net, iniface); if (!dev) { pr_info("no such interface %s\n", iniface); kfree(c); return ERR_PTR(-ENOENT); } c->ifindex = dev->ifindex; strcpy(c->ifname, dev->name); memcpy(&c->clustermac, &i->clustermac, ETH_ALEN); dev_mc_add(dev, c->clustermac); dev_put(dev); c->clusterip = ip; c->num_total_nodes = i->num_total_nodes; clusterip_config_init_nodelist(c, i); c->hash_mode = i->hash_mode; c->hash_initval = i->hash_initval; c->net = net; refcount_set(&c->refcount, 1); spin_lock_bh(&cn->lock); if 
(__clusterip_config_find(net, ip)) { err = -EBUSY; goto out_config_put; } list_add_rcu(&c->list, &cn->configs); spin_unlock_bh(&cn->lock); #ifdef CONFIG_PROC_FS { char buffer[16]; /* create proc dir entry */ sprintf(buffer, "%pI4", &ip); mutex_lock(&cn->mutex); c->pde = proc_create_data(buffer, 0600, cn->procdir, &clusterip_proc_ops, c); mutex_unlock(&cn->mutex); if (!c->pde) { err = -ENOMEM; goto err; } } #endif refcount_set(&c->entries, 1); return c; #ifdef CONFIG_PROC_FS err: #endif spin_lock_bh(&cn->lock); list_del_rcu(&c->list); out_config_put: spin_unlock_bh(&cn->lock); clusterip_config_put(c); return ERR_PTR(err); } #ifdef CONFIG_PROC_FS static int clusterip_add_node(struct clusterip_config *c, u_int16_t nodenum) { if (nodenum == 0 || nodenum > c->num_total_nodes) return 1; /* check if we already have this number in our bitfield */ if (test_and_set_bit(nodenum - 1, &c->local_nodes)) return 1; return 0; } static bool clusterip_del_node(struct clusterip_config *c, u_int16_t nodenum) { if (nodenum == 0 || nodenum > c->num_total_nodes) return true; if (test_and_clear_bit(nodenum - 1, &c->local_nodes)) return false; return true; } #endif static inline u_int32_t clusterip_hashfn(const struct sk_buff *skb, const struct clusterip_config *config) { const struct iphdr *iph = ip_hdr(skb); unsigned long hashval; u_int16_t sport = 0, dport = 0; int poff; poff = proto_ports_offset(iph->protocol); if (poff >= 0) { const u_int16_t *ports; u16 _ports[2]; ports = skb_header_pointer(skb, iph->ihl * 4 + poff, 4, _ports); if (ports) { sport = ports[0]; dport = ports[1]; } } else { net_info_ratelimited("unknown protocol %u\n", iph->protocol); } switch (config->hash_mode) { case CLUSTERIP_HASHMODE_SIP: hashval = jhash_1word(ntohl(iph->saddr), config->hash_initval); break; case CLUSTERIP_HASHMODE_SIP_SPT: hashval = jhash_2words(ntohl(iph->saddr), sport, config->hash_initval); break; case CLUSTERIP_HASHMODE_SIP_SPT_DPT: hashval = jhash_3words(ntohl(iph->saddr), sport, dport, config->hash_initval); break; default: /* to make gcc happy */ hashval = 0; /* This cannot happen, unless the check function wasn't called * at rule load time */ pr_info("unknown mode %u\n", config->hash_mode); BUG(); break; } /* node numbers are 1..n, not 0..n */ return reciprocal_scale(hashval, config->num_total_nodes) + 1; } static inline int clusterip_responsible(const struct clusterip_config *config, u_int32_t hash) { return test_bit(hash - 1, &config->local_nodes); } /*********************************************************************** * IPTABLES TARGET ***********************************************************************/ static unsigned int clusterip_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct ipt_clusterip_tgt_info *cipinfo = par->targinfo; struct nf_conn *ct; enum ip_conntrack_info ctinfo; u_int32_t hash; /* don't need to clusterip_config_get() here, since refcount * is only decremented by destroy() - and ip_tables guarantees * that the ->target() function isn't called after ->destroy() */ ct = nf_ct_get(skb, &ctinfo); if (ct == NULL) return NF_DROP; /* special case: ICMP error handling. 
conntrack distinguishes between * error messages (RELATED) and information requests (see below) */ if (ip_hdr(skb)->protocol == IPPROTO_ICMP && (ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED_REPLY)) return XT_CONTINUE; /* nf_conntrack_proto_icmp guarantees us that we only have ICMP_ECHO, * TIMESTAMP, INFO_REQUEST or ICMP_ADDRESS type icmp packets from here * on, which all have an ID field [relevant for hashing]. */ hash = clusterip_hashfn(skb, cipinfo->config); switch (ctinfo) { case IP_CT_NEW: WRITE_ONCE(ct->mark, hash); break; case IP_CT_RELATED: case IP_CT_RELATED_REPLY: /* FIXME: we don't handle expectations at the moment. * They can arrive on a different node than * the master connection (e.g. FTP passive mode) */ case IP_CT_ESTABLISHED: case IP_CT_ESTABLISHED_REPLY: break; default: /* Prevent gcc warnings */ break; } #ifdef DEBUG nf_ct_dump_tuple_ip(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); #endif pr_debug("hash=%u ct_hash=%u ", hash, READ_ONCE(ct->mark)); if (!clusterip_responsible(cipinfo->config, hash)) { pr_debug("not responsible\n"); return NF_DROP; } pr_debug("responsible\n"); /* despite being received via linklayer multicast, this is * actually a unicast IP packet. TCP doesn't like PACKET_MULTICAST */ skb->pkt_type = PACKET_HOST; return XT_CONTINUE; } static int clusterip_tg_check(const struct xt_tgchk_param *par) { struct ipt_clusterip_tgt_info *cipinfo = par->targinfo; struct clusterip_net *cn = clusterip_pernet(par->net); const struct ipt_entry *e = par->entryinfo; struct clusterip_config *config; int ret, i; if (par->nft_compat) { pr_err("cannot use CLUSTERIP target from nftables compat\n"); return -EOPNOTSUPP; } if (cn->hook_users == UINT_MAX) return -EOVERFLOW; if (cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP && cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP_SPT && cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP_SPT_DPT) { pr_info("unknown mode %u\n", cipinfo->hash_mode); return -EINVAL; } if (e->ip.dmsk.s_addr != htonl(0xffffffff) || e->ip.dst.s_addr == 0) { pr_info("Please specify destination IP\n"); return -EINVAL; } if (cipinfo->num_local_nodes > ARRAY_SIZE(cipinfo->local_nodes)) { pr_info("bad num_local_nodes %u\n", cipinfo->num_local_nodes); return -EINVAL; } for (i = 0; i < cipinfo->num_local_nodes; i++) { if (cipinfo->local_nodes[i] - 1 >= sizeof(config->local_nodes) * 8) { pr_info("bad local_nodes[%d] %u\n", i, cipinfo->local_nodes[i]); return -EINVAL; } } config = clusterip_config_find_get(par->net, e->ip.dst.s_addr, 1); if (!config) { if (!(cipinfo->flags & CLUSTERIP_FLAG_NEW)) { pr_info("no config found for %pI4, need 'new'\n", &e->ip.dst.s_addr); return -EINVAL; } else { config = clusterip_config_init(par->net, cipinfo, e->ip.dst.s_addr, e->ip.iniface); if (IS_ERR(config)) return PTR_ERR(config); } } else if (memcmp(&config->clustermac, &cipinfo->clustermac, ETH_ALEN)) { clusterip_config_entry_put(config); clusterip_config_put(config); return -EINVAL; } ret = nf_ct_netns_get(par->net, par->family); if (ret < 0) { pr_info("cannot load conntrack support for proto=%u\n", par->family); clusterip_config_entry_put(config); clusterip_config_put(config); return ret; } if (cn->hook_users == 0) { ret = nf_register_net_hook(par->net, &cip_arp_ops); if (ret < 0) { clusterip_config_entry_put(config); clusterip_config_put(config); nf_ct_netns_put(par->net, par->family); return ret; } } cn->hook_users++; if (!cn->clusterip_deprecated_warning) { pr_info("ipt_CLUSTERIP is deprecated and it will removed soon, " "use xt_cluster instead\n"); cn->clusterip_deprecated_warning = 
true; } cipinfo->config = config; return ret; } /* drop reference count of cluster config when rule is deleted */ static void clusterip_tg_destroy(const struct xt_tgdtor_param *par) { const struct ipt_clusterip_tgt_info *cipinfo = par->targinfo; struct clusterip_net *cn = clusterip_pernet(par->net); /* if no more entries are referencing the config, remove it * from the list and destroy the proc entry */ clusterip_config_entry_put(cipinfo->config); clusterip_config_put(cipinfo->config); nf_ct_netns_put(par->net, par->family); cn->hook_users--; if (cn->hook_users == 0) nf_unregister_net_hook(par->net, &cip_arp_ops); } #ifdef CONFIG_NETFILTER_XTABLES_COMPAT struct compat_ipt_clusterip_tgt_info { u_int32_t flags; u_int8_t clustermac[6]; u_int16_t num_total_nodes; u_int16_t num_local_nodes; u_int16_t local_nodes[CLUSTERIP_MAX_NODES]; u_int32_t hash_mode; u_int32_t hash_initval; compat_uptr_t config; }; #endif /* CONFIG_NETFILTER_XTABLES_COMPAT */ static struct xt_target clusterip_tg_reg __read_mostly = { .name = "CLUSTERIP", .family = NFPROTO_IPV4, .target = clusterip_tg, .checkentry = clusterip_tg_check, .destroy = clusterip_tg_destroy, .targetsize = sizeof(struct ipt_clusterip_tgt_info), .usersize = offsetof(struct ipt_clusterip_tgt_info, config), #ifdef CONFIG_NETFILTER_XTABLES_COMPAT .compatsize = sizeof(struct compat_ipt_clusterip_tgt_info), #endif /* CONFIG_NETFILTER_XTABLES_COMPAT */ .me = THIS_MODULE }; /*********************************************************************** * ARP MANGLING CODE ***********************************************************************/ /* hardcoded for 48bit ethernet and 32bit ipv4 addresses */ struct arp_payload { u_int8_t src_hw[ETH_ALEN]; __be32 src_ip; u_int8_t dst_hw[ETH_ALEN]; __be32 dst_ip; } __packed; #ifdef DEBUG static void arp_print(struct arp_payload *payload) { #define HBUFFERLEN 30 char hbuffer[HBUFFERLEN]; int j, k; for (k = 0, j = 0; k < HBUFFERLEN - 3 && j < ETH_ALEN; j++) { hbuffer[k++] = hex_asc_hi(payload->src_hw[j]); hbuffer[k++] = hex_asc_lo(payload->src_hw[j]); hbuffer[k++] = ':'; } hbuffer[--k] = '\0'; pr_debug("src %pI4@%s, dst %pI4\n", &payload->src_ip, hbuffer, &payload->dst_ip); } #endif static unsigned int clusterip_arp_mangle(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct arphdr *arp = arp_hdr(skb); struct arp_payload *payload; struct clusterip_config *c; struct net *net = state->net; /* we don't care about non-ethernet and non-ipv4 ARP */ if (arp->ar_hrd != htons(ARPHRD_ETHER) || arp->ar_pro != htons(ETH_P_IP) || arp->ar_pln != 4 || arp->ar_hln != ETH_ALEN) return NF_ACCEPT; /* we only want to mangle arp requests and replies */ if (arp->ar_op != htons(ARPOP_REPLY) && arp->ar_op != htons(ARPOP_REQUEST)) return NF_ACCEPT; payload = (void *)(arp+1); /* if there is no clusterip configuration for the arp reply's * source ip, we don't want to mangle it */ c = clusterip_config_find_get(net, payload->src_ip, 0); if (!c) return NF_ACCEPT; /* normally the linux kernel always replies to arp queries of * addresses on different interfacs. 
However, in the CLUSTERIP case * this wouldn't work, since we didn't subscribe the mcast group on * other interfaces */ if (c->ifindex != state->out->ifindex) { pr_debug("not mangling arp reply on different interface: cip'%d'-skb'%d'\n", c->ifindex, state->out->ifindex); clusterip_config_put(c); return NF_ACCEPT; } /* mangle reply hardware address */ memcpy(payload->src_hw, c->clustermac, arp->ar_hln); #ifdef DEBUG pr_debug("mangled arp reply: "); arp_print(payload); #endif clusterip_config_put(c); return NF_ACCEPT; } /*********************************************************************** * PROC DIR HANDLING ***********************************************************************/ #ifdef CONFIG_PROC_FS struct clusterip_seq_position { unsigned int pos; /* position */ unsigned int weight; /* number of bits set == size */ unsigned int bit; /* current bit */ unsigned long val; /* current value */ }; static void *clusterip_seq_start(struct seq_file *s, loff_t *pos) { struct clusterip_config *c = s->private; unsigned int weight; u_int32_t local_nodes; struct clusterip_seq_position *idx; /* FIXME: possible race */ local_nodes = c->local_nodes; weight = hweight32(local_nodes); if (*pos >= weight) return NULL; idx = kmalloc(sizeof(struct clusterip_seq_position), GFP_KERNEL); if (!idx) return ERR_PTR(-ENOMEM); idx->pos = *pos; idx->weight = weight; idx->bit = ffs(local_nodes); idx->val = local_nodes; clear_bit(idx->bit - 1, &idx->val); return idx; } static void *clusterip_seq_next(struct seq_file *s, void *v, loff_t *pos) { struct clusterip_seq_position *idx = v; *pos = ++idx->pos; if (*pos >= idx->weight) { kfree(v); return NULL; } idx->bit = ffs(idx->val); clear_bit(idx->bit - 1, &idx->val); return idx; } static void clusterip_seq_stop(struct seq_file *s, void *v) { if (!IS_ERR(v)) kfree(v); } static int clusterip_seq_show(struct seq_file *s, void *v) { struct clusterip_seq_position *idx = v; if (idx->pos != 0) seq_putc(s, ','); seq_printf(s, "%u", idx->bit); if (idx->pos == idx->weight - 1) seq_putc(s, '\n'); return 0; } static const struct seq_operations clusterip_seq_ops = { .start = clusterip_seq_start, .next = clusterip_seq_next, .stop = clusterip_seq_stop, .show = clusterip_seq_show, }; static int clusterip_proc_open(struct inode *inode, struct file *file) { int ret = seq_open(file, &clusterip_seq_ops); if (!ret) { struct seq_file *sf = file->private_data; struct clusterip_config *c = PDE_DATA(inode); sf->private = c; clusterip_config_get(c); } return ret; } static int clusterip_proc_release(struct inode *inode, struct file *file) { struct clusterip_config *c = PDE_DATA(inode); int ret; ret = seq_release(inode, file); if (!ret) clusterip_config_put(c); return ret; } static ssize_t clusterip_proc_write(struct file *file, const char __user *input, size_t size, loff_t *ofs) { struct clusterip_config *c = PDE_DATA(file_inode(file)); #define PROC_WRITELEN 10 char buffer[PROC_WRITELEN+1]; unsigned long nodenum; int rc; if (size > PROC_WRITELEN) return -EIO; if (copy_from_user(buffer, input, size)) return -EFAULT; buffer[size] = 0; if (*buffer == '+') { rc = kstrtoul(buffer+1, 10, &nodenum); if (rc) return rc; if (clusterip_add_node(c, nodenum)) return -ENOMEM; } else if (*buffer == '-') { rc = kstrtoul(buffer+1, 10, &nodenum); if (rc) return rc; if (clusterip_del_node(c, nodenum)) return -ENOENT; } else return -EIO; return size; } static const struct proc_ops clusterip_proc_ops = { .proc_open = clusterip_proc_open, .proc_read = seq_read, .proc_write = clusterip_proc_write, .proc_lseek = seq_lseek, 
.proc_release = clusterip_proc_release, }; #endif /* CONFIG_PROC_FS */ static int clusterip_net_init(struct net *net) { struct clusterip_net *cn = clusterip_pernet(net); INIT_LIST_HEAD(&cn->configs); spin_lock_init(&cn->lock); #ifdef CONFIG_PROC_FS cn->procdir = proc_mkdir("ipt_CLUSTERIP", net->proc_net); if (!cn->procdir) { pr_err("Unable to create proc dir entry\n"); return -ENOMEM; } mutex_init(&cn->mutex); #endif /* CONFIG_PROC_FS */ return 0; } static void clusterip_net_exit(struct net *net) { #ifdef CONFIG_PROC_FS struct clusterip_net *cn = clusterip_pernet(net); mutex_lock(&cn->mutex); proc_remove(cn->procdir); cn->procdir = NULL; mutex_unlock(&cn->mutex); #endif } static struct pernet_operations clusterip_net_ops = { .init = clusterip_net_init, .exit = clusterip_net_exit, .id = &clusterip_net_id, .size = sizeof(struct clusterip_net), }; static struct notifier_block cip_netdev_notifier = { .notifier_call = clusterip_netdev_event }; static int __init clusterip_tg_init(void) { int ret; ret = register_pernet_subsys(&clusterip_net_ops); if (ret < 0) return ret; ret = xt_register_target(&clusterip_tg_reg); if (ret < 0) goto cleanup_subsys; ret = register_netdevice_notifier(&cip_netdev_notifier); if (ret < 0) goto unregister_target; pr_info("ClusterIP Version %s loaded successfully\n", CLUSTERIP_VERSION); return 0; unregister_target: xt_unregister_target(&clusterip_tg_reg); cleanup_subsys: unregister_pernet_subsys(&clusterip_net_ops); return ret; } static void __exit clusterip_tg_exit(void) { pr_info("ClusterIP Version %s unloading\n", CLUSTERIP_VERSION); unregister_netdevice_notifier(&cip_netdev_notifier); xt_unregister_target(&clusterip_tg_reg); unregister_pernet_subsys(&clusterip_net_ops); /* Wait for completion of call_rcu()'s (clusterip_config_rcu_free) */ rcu_barrier(); } module_init(clusterip_tg_init); module_exit(clusterip_tg_exit);
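/*
 * Illustrative sketch, not part of the module above: the node-selection math
 * used by clusterip_hashfn() and clusterip_responsible(), shown in isolation
 * for the CLUSTERIP_HASHMODE_SIP case.  example_node_accepts() is a
 * hypothetical helper; saddr_host is the source address already converted
 * with ntohl(), as clusterip_hashfn() does before hashing.
 */
#include <linux/jhash.h>
#include <linux/kernel.h>	/* reciprocal_scale() */
#include <linux/bitops.h>
#include <linux/types.h>

static bool example_node_accepts(u32 saddr_host, u32 hash_initval,
				 u16 num_total_nodes,
				 const unsigned long *local_nodes)
{
	u32 hash = jhash_1word(saddr_host, hash_initval);
	u32 node = reciprocal_scale(hash, num_total_nodes) + 1;	/* 1..n */

	/* A node handles the packet only if its bit is set locally. */
	return test_bit(node - 1, local_nodes);
}

/*
 * For CLUSTERIP_HASHMODE_SIP_SPT and _SIP_SPT_DPT the module folds the
 * source (and destination) port into the hash with jhash_2words()/
 * jhash_3words() before the same reciprocal_scale() mapping.
 */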
// SPDX-License-Identifier: GPL-2.0-or-later /* SCTP kernel implementation * (C) Copyright IBM Corp. 2001, 2004 * Copyright (c) 1999 Cisco, Inc. * Copyright (c) 1999-2001 Motorola, Inc. * * This file is part of the SCTP kernel implementation * * These functions work with the state functions in sctp_sm_statefuns.c * to implement the state operations. These functions implement the * steps which require modifying existing data structures. * * Please send any bug reports or fixes you make to the * email address(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * La Monte H.P. Yarroll <piggy@acm.org> * Karl Knutson <karl@athena.chicago.il.us> * Jon Grimm <jgrimm@austin.ibm.com> * Hui Huang <hui.huang@nokia.com> * Dajiang Zhang <dajiang.zhang@nokia.com> * Daisy Chang <daisyc@us.ibm.com> * Sridhar Samudrala <sri@us.ibm.com> * Ardelle Fan <ardelle.fan@intel.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/skbuff.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/ip.h> #include <linux/gfp.h> #include <net/sock.h> #include <net/sctp/sctp.h> #include <net/sctp/sm.h> #include <net/sctp/stream_sched.h> static int sctp_cmd_interpreter(enum sctp_event_type event_type, union sctp_subtype subtype, enum sctp_state state, struct sctp_endpoint *ep, struct sctp_association *asoc, void *event_arg, enum sctp_disposition status, struct sctp_cmd_seq *commands, gfp_t gfp); static int sctp_side_effects(enum sctp_event_type event_type, union sctp_subtype subtype, enum sctp_state state, struct sctp_endpoint *ep, struct sctp_association **asoc, void *event_arg, enum sctp_disposition status, struct sctp_cmd_seq *commands, gfp_t gfp); /******************************************************************** * Helper functions ********************************************************************/ /* A helper function for delayed processing of INET ECN CE bit. */ static void sctp_do_ecn_ce_work(struct sctp_association *asoc, __u32 lowest_tsn) { /* Save the TSN away for comparison when we receive CWR */ asoc->last_ecne_tsn = lowest_tsn; asoc->need_ecne = 1; } /* Helper function for delayed processing of SCTP ECNE chunk. 
*/ /* RFC 2960 Appendix A * * RFC 2481 details a specific bit for a sender to send in * the header of its next outbound TCP segment to indicate to * its peer that it has reduced its congestion window. This * is termed the CWR bit. For SCTP the same indication is made * by including the CWR chunk. This chunk contains one data * element, i.e. the TSN number that was sent in the ECNE chunk. * This element represents the lowest TSN number in the datagram * that was originally marked with the CE bit. */ static struct sctp_chunk *sctp_do_ecn_ecne_work(struct sctp_association *asoc, __u32 lowest_tsn, struct sctp_chunk *chunk) { struct sctp_chunk *repl; /* Our previously transmitted packet ran into some congestion * so we should take action by reducing cwnd and ssthresh * and then ACK our peer that we we've done so by * sending a CWR. */ /* First, try to determine if we want to actually lower * our cwnd variables. Only lower them if the ECNE looks more * recent than the last response. */ if (TSN_lt(asoc->last_cwr_tsn, lowest_tsn)) { struct sctp_transport *transport; /* Find which transport's congestion variables * need to be adjusted. */ transport = sctp_assoc_lookup_tsn(asoc, lowest_tsn); /* Update the congestion variables. */ if (transport) sctp_transport_lower_cwnd(transport, SCTP_LOWER_CWND_ECNE); asoc->last_cwr_tsn = lowest_tsn; } /* Always try to quiet the other end. In case of lost CWR, * resend last_cwr_tsn. */ repl = sctp_make_cwr(asoc, asoc->last_cwr_tsn, chunk); /* If we run out of memory, it will look like a lost CWR. We'll * get back in sync eventually. */ return repl; } /* Helper function to do delayed processing of ECN CWR chunk. */ static void sctp_do_ecn_cwr_work(struct sctp_association *asoc, __u32 lowest_tsn) { /* Turn off ECNE getting auto-prepended to every outgoing * packet */ asoc->need_ecne = 0; } /* Generate SACK if necessary. We call this at the end of a packet. */ static int sctp_gen_sack(struct sctp_association *asoc, int force, struct sctp_cmd_seq *commands) { struct sctp_transport *trans = asoc->peer.last_data_from; __u32 ctsn, max_tsn_seen; struct sctp_chunk *sack; int error = 0; if (force || (!trans && (asoc->param_flags & SPP_SACKDELAY_DISABLE)) || (trans && (trans->param_flags & SPP_SACKDELAY_DISABLE))) asoc->peer.sack_needed = 1; ctsn = sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map); max_tsn_seen = sctp_tsnmap_get_max_tsn_seen(&asoc->peer.tsn_map); /* From 12.2 Parameters necessary per association (i.e. the TCB): * * Ack State : This flag indicates if the next received packet * : is to be responded to with a SACK. ... * : When DATA chunks are out of order, SACK's * : are not delayed (see Section 6). * * [This is actually not mentioned in Section 6, but we * implement it here anyway. --piggy] */ if (max_tsn_seen != ctsn) asoc->peer.sack_needed = 1; /* From 6.2 Acknowledgement on Reception of DATA Chunks: * * Section 4.2 of [RFC2581] SHOULD be followed. Specifically, * an acknowledgement SHOULD be generated for at least every * second packet (not every second DATA chunk) received, and * SHOULD be generated within 200 ms of the arrival of any * unacknowledged DATA chunk. ... */ if (!asoc->peer.sack_needed) { asoc->peer.sack_cnt++; /* Set the SACK delay timeout based on the * SACK delay for the last transport * data was received from, or the default * for the association. */ if (trans) { /* We will need a SACK for the next packet. 
*/ if (asoc->peer.sack_cnt >= trans->sackfreq - 1) asoc->peer.sack_needed = 1; asoc->timeouts[SCTP_EVENT_TIMEOUT_SACK] = trans->sackdelay; } else { /* We will need a SACK for the next packet. */ if (asoc->peer.sack_cnt >= asoc->sackfreq - 1) asoc->peer.sack_needed = 1; asoc->timeouts[SCTP_EVENT_TIMEOUT_SACK] = asoc->sackdelay; } /* Restart the SACK timer. */ sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART, SCTP_TO(SCTP_EVENT_TIMEOUT_SACK)); } else { __u32 old_a_rwnd = asoc->a_rwnd; asoc->a_rwnd = asoc->rwnd; sack = sctp_make_sack(asoc); if (!sack) { asoc->a_rwnd = old_a_rwnd; goto nomem; } asoc->peer.sack_needed = 0; asoc->peer.sack_cnt = 0; sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(sack)); /* Stop the SACK timer. */ sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP, SCTP_TO(SCTP_EVENT_TIMEOUT_SACK)); } return error; nomem: error = -ENOMEM; return error; } /* When the T3-RTX timer expires, it calls this function to create the * relevant state machine event. */ void sctp_generate_t3_rtx_event(struct timer_list *t) { struct sctp_transport *transport = from_timer(transport, t, T3_rtx_timer); struct sctp_association *asoc = transport->asoc; struct sock *sk = asoc->base.sk; struct net *net = sock_net(sk); int error; /* Check whether a task is in the sock. */ bh_lock_sock(sk); if (sock_owned_by_user(sk)) { pr_debug("%s: sock is busy\n", __func__); /* Try again later. */ if (!mod_timer(&transport->T3_rtx_timer, jiffies + (HZ/20))) sctp_transport_hold(transport); goto out_unlock; } /* Run through the state machine. */ error = sctp_do_sm(net, SCTP_EVENT_T_TIMEOUT, SCTP_ST_TIMEOUT(SCTP_EVENT_TIMEOUT_T3_RTX), asoc->state, asoc->ep, asoc, transport, GFP_ATOMIC); if (error) sk->sk_err = -error; out_unlock: bh_unlock_sock(sk); sctp_transport_put(transport); } /* This is a sa interface for producing timeout events. It works * for timeouts which use the association as their parameter. */ static void sctp_generate_timeout_event(struct sctp_association *asoc, enum sctp_event_timeout timeout_type) { struct sock *sk = asoc->base.sk; struct net *net = sock_net(sk); int error = 0; bh_lock_sock(sk); if (sock_owned_by_user(sk)) { pr_debug("%s: sock is busy: timer %d\n", __func__, timeout_type); /* Try again later. */ if (!mod_timer(&asoc->timers[timeout_type], jiffies + (HZ/20))) sctp_association_hold(asoc); goto out_unlock; } /* Is this association really dead and just waiting around for * the timer to let go of the reference? */ if (asoc->base.dead) goto out_unlock; /* Run through the state machine. 
*/ error = sctp_do_sm(net, SCTP_EVENT_T_TIMEOUT, SCTP_ST_TIMEOUT(timeout_type), asoc->state, asoc->ep, asoc, (void *)timeout_type, GFP_ATOMIC); if (error) sk->sk_err = -error; out_unlock: bh_unlock_sock(sk); sctp_association_put(asoc); } static void sctp_generate_t1_cookie_event(struct timer_list *t) { struct sctp_association *asoc = from_timer(asoc, t, timers[SCTP_EVENT_TIMEOUT_T1_COOKIE]); sctp_generate_timeout_event(asoc, SCTP_EVENT_TIMEOUT_T1_COOKIE); } static void sctp_generate_t1_init_event(struct timer_list *t) { struct sctp_association *asoc = from_timer(asoc, t, timers[SCTP_EVENT_TIMEOUT_T1_INIT]); sctp_generate_timeout_event(asoc, SCTP_EVENT_TIMEOUT_T1_INIT); } static void sctp_generate_t2_shutdown_event(struct timer_list *t) { struct sctp_association *asoc = from_timer(asoc, t, timers[SCTP_EVENT_TIMEOUT_T2_SHUTDOWN]); sctp_generate_timeout_event(asoc, SCTP_EVENT_TIMEOUT_T2_SHUTDOWN); } static void sctp_generate_t4_rto_event(struct timer_list *t) { struct sctp_association *asoc = from_timer(asoc, t, timers[SCTP_EVENT_TIMEOUT_T4_RTO]); sctp_generate_timeout_event(asoc, SCTP_EVENT_TIMEOUT_T4_RTO); } static void sctp_generate_t5_shutdown_guard_event(struct timer_list *t) { struct sctp_association *asoc = from_timer(asoc, t, timers[SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD]); sctp_generate_timeout_event(asoc, SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD); } /* sctp_generate_t5_shutdown_guard_event() */ static void sctp_generate_autoclose_event(struct timer_list *t) { struct sctp_association *asoc = from_timer(asoc, t, timers[SCTP_EVENT_TIMEOUT_AUTOCLOSE]); sctp_generate_timeout_event(asoc, SCTP_EVENT_TIMEOUT_AUTOCLOSE); } /* Generate a heart beat event. If the sock is busy, reschedule. Make * sure that the transport is still valid. */ void sctp_generate_heartbeat_event(struct timer_list *t) { struct sctp_transport *transport = from_timer(transport, t, hb_timer); struct sctp_association *asoc = transport->asoc; struct sock *sk = asoc->base.sk; struct net *net = sock_net(sk); u32 elapsed, timeout; int error = 0; bh_lock_sock(sk); if (sock_owned_by_user(sk)) { pr_debug("%s: sock is busy\n", __func__); /* Try again later. */ if (!mod_timer(&transport->hb_timer, jiffies + (HZ/20))) sctp_transport_hold(transport); goto out_unlock; } /* Check if we should still send the heartbeat or reschedule */ elapsed = jiffies - transport->last_time_sent; timeout = sctp_transport_timeout(transport); if (elapsed < timeout) { elapsed = timeout - elapsed; if (!mod_timer(&transport->hb_timer, jiffies + elapsed)) sctp_transport_hold(transport); goto out_unlock; } error = sctp_do_sm(net, SCTP_EVENT_T_TIMEOUT, SCTP_ST_TIMEOUT(SCTP_EVENT_TIMEOUT_HEARTBEAT), asoc->state, asoc->ep, asoc, transport, GFP_ATOMIC); if (error) sk->sk_err = -error; out_unlock: bh_unlock_sock(sk); sctp_transport_put(transport); } /* Handle the timeout of the ICMP protocol unreachable timer. Trigger * the correct state machine transition that will close the association. */ void sctp_generate_proto_unreach_event(struct timer_list *t) { struct sctp_transport *transport = from_timer(transport, t, proto_unreach_timer); struct sctp_association *asoc = transport->asoc; struct sock *sk = asoc->base.sk; struct net *net = sock_net(sk); bh_lock_sock(sk); if (sock_owned_by_user(sk)) { pr_debug("%s: sock is busy\n", __func__); /* Try again later. */ if (!mod_timer(&transport->proto_unreach_timer, jiffies + (HZ/20))) sctp_transport_hold(transport); goto out_unlock; } /* Is this structure just waiting around for us to actually * get destroyed? 
*/ if (asoc->base.dead) goto out_unlock; sctp_do_sm(net, SCTP_EVENT_T_OTHER, SCTP_ST_OTHER(SCTP_EVENT_ICMP_PROTO_UNREACH), asoc->state, asoc->ep, asoc, transport, GFP_ATOMIC); out_unlock: bh_unlock_sock(sk); sctp_transport_put(transport); } /* Handle the timeout of the RE-CONFIG timer. */ void sctp_generate_reconf_event(struct timer_list *t) { struct sctp_transport *transport = from_timer(transport, t, reconf_timer); struct sctp_association *asoc = transport->asoc; struct sock *sk = asoc->base.sk; struct net *net = sock_net(sk); int error = 0; bh_lock_sock(sk); if (sock_owned_by_user(sk)) { pr_debug("%s: sock is busy\n", __func__); /* Try again later. */ if (!mod_timer(&transport->reconf_timer, jiffies + (HZ / 20))) sctp_transport_hold(transport); goto out_unlock; } /* This happens when the response arrives after the timer is triggered. */ if (!asoc->strreset_chunk) goto out_unlock; error = sctp_do_sm(net, SCTP_EVENT_T_TIMEOUT, SCTP_ST_TIMEOUT(SCTP_EVENT_TIMEOUT_RECONF), asoc->state, asoc->ep, asoc, transport, GFP_ATOMIC); if (error) sk->sk_err = -error; out_unlock: bh_unlock_sock(sk); sctp_transport_put(transport); } /* Handle the timeout of the probe timer. */ void sctp_generate_probe_event(struct timer_list *t) { struct sctp_transport *transport = from_timer(transport, t, probe_timer); struct sctp_association *asoc = transport->asoc; struct sock *sk = asoc->base.sk; struct net *net = sock_net(sk); int error = 0; bh_lock_sock(sk); if (sock_owned_by_user(sk)) { pr_debug("%s: sock is busy\n", __func__); /* Try again later. */ if (!mod_timer(&transport->probe_timer, jiffies + (HZ / 20))) sctp_transport_hold(transport); goto out_unlock; } error = sctp_do_sm(net, SCTP_EVENT_T_TIMEOUT, SCTP_ST_TIMEOUT(SCTP_EVENT_TIMEOUT_PROBE), asoc->state, asoc->ep, asoc, transport, GFP_ATOMIC); if (error) sk->sk_err = -error; out_unlock: bh_unlock_sock(sk); sctp_transport_put(transport); } /* Inject a SACK Timeout event into the state machine. */ static void sctp_generate_sack_event(struct timer_list *t) { struct sctp_association *asoc = from_timer(asoc, t, timers[SCTP_EVENT_TIMEOUT_SACK]); sctp_generate_timeout_event(asoc, SCTP_EVENT_TIMEOUT_SACK); } sctp_timer_event_t *sctp_timer_events[SCTP_NUM_TIMEOUT_TYPES] = { [SCTP_EVENT_TIMEOUT_NONE] = NULL, [SCTP_EVENT_TIMEOUT_T1_COOKIE] = sctp_generate_t1_cookie_event, [SCTP_EVENT_TIMEOUT_T1_INIT] = sctp_generate_t1_init_event, [SCTP_EVENT_TIMEOUT_T2_SHUTDOWN] = sctp_generate_t2_shutdown_event, [SCTP_EVENT_TIMEOUT_T3_RTX] = NULL, [SCTP_EVENT_TIMEOUT_T4_RTO] = sctp_generate_t4_rto_event, [SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD] = sctp_generate_t5_shutdown_guard_event, [SCTP_EVENT_TIMEOUT_HEARTBEAT] = NULL, [SCTP_EVENT_TIMEOUT_RECONF] = NULL, [SCTP_EVENT_TIMEOUT_SACK] = sctp_generate_sack_event, [SCTP_EVENT_TIMEOUT_AUTOCLOSE] = sctp_generate_autoclose_event, }; /* RFC 2960 8.2 Path Failure Detection * * When its peer endpoint is multi-homed, an endpoint should keep a * error counter for each of the destination transport addresses of the * peer endpoint. * * Each time the T3-rtx timer expires on any address, or when a * HEARTBEAT sent to an idle address is not acknowledged within a RTO, * the error counter of that destination address will be incremented. * When the value in the error counter exceeds the protocol parameter * 'Path.Max.Retrans' of that destination address, the endpoint should * mark the destination transport address as inactive, and a * notification SHOULD be sent to the upper layer. 
* */ static void sctp_do_8_2_transport_strike(struct sctp_cmd_seq *commands, struct sctp_association *asoc, struct sctp_transport *transport, int is_hb) { /* The check for association's overall error counter exceeding the * threshold is done in the state function. */ /* We are here due to a timer expiration. If the timer was * not a HEARTBEAT, then normal error tracking is done. * If the timer was a heartbeat, we only increment error counts * when we already have an outstanding HEARTBEAT that has not * been acknowledged. * Additionally, some tranport states inhibit error increments. */ if (!is_hb) { asoc->overall_error_count++; if (transport->state != SCTP_INACTIVE) transport->error_count++; } else if (transport->hb_sent) { if (transport->state != SCTP_UNCONFIRMED) asoc->overall_error_count++; if (transport->state != SCTP_INACTIVE) transport->error_count++; } /* If the transport error count is greater than the pf_retrans * threshold, and less than pathmaxrtx, and if the current state * is SCTP_ACTIVE, then mark this transport as Partially Failed, * see SCTP Quick Failover Draft, section 5.1 */ if (asoc->base.net->sctp.pf_enable && transport->state == SCTP_ACTIVE && transport->error_count < transport->pathmaxrxt && transport->error_count > transport->pf_retrans) { sctp_assoc_control_transport(asoc, transport, SCTP_TRANSPORT_PF, 0); /* Update the hb timer to resend a heartbeat every rto */ sctp_transport_reset_hb_timer(transport); } if (transport->state != SCTP_INACTIVE && (transport->error_count > transport->pathmaxrxt)) { pr_debug("%s: association:%p transport addr:%pISpc failed\n", __func__, asoc, &transport->ipaddr.sa); sctp_assoc_control_transport(asoc, transport, SCTP_TRANSPORT_DOWN, SCTP_FAILED_THRESHOLD); } if (transport->error_count > transport->ps_retrans && asoc->peer.primary_path == transport && asoc->peer.active_path != transport) sctp_assoc_set_primary(asoc, asoc->peer.active_path); /* E2) For the destination address for which the timer * expires, set RTO <- RTO * 2 ("back off the timer"). The * maximum value discussed in rule C7 above (RTO.max) may be * used to provide an upper bound to this doubling operation. * * Special Case: the first HB doesn't trigger exponential backoff. * The first unacknowledged HB triggers it. We do this with a flag * that indicates that we have an outstanding HB. */ if (!is_hb || transport->hb_sent) { transport->rto = min((transport->rto * 2), transport->asoc->rto_max); sctp_max_rto(asoc, transport); } } /* Worker routine to handle INIT command failure. */ static void sctp_cmd_init_failed(struct sctp_cmd_seq *commands, struct sctp_association *asoc, unsigned int error) { struct sctp_ulpevent *event; event = sctp_ulpevent_make_assoc_change(asoc, 0, SCTP_CANT_STR_ASSOC, (__u16)error, 0, 0, NULL, GFP_ATOMIC); if (event) sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP, SCTP_ULPEVENT(event)); sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE, SCTP_STATE(SCTP_STATE_CLOSED)); /* SEND_FAILED sent later when cleaning up the association. */ asoc->outqueue.error = error; sctp_add_cmd_sf(commands, SCTP_CMD_DELETE_TCB, SCTP_NULL()); } /* Worker routine to handle SCTP_CMD_ASSOC_FAILED. */ static void sctp_cmd_assoc_failed(struct sctp_cmd_seq *commands, struct sctp_association *asoc, enum sctp_event_type event_type, union sctp_subtype subtype, struct sctp_chunk *chunk, unsigned int error) { struct sctp_ulpevent *event; struct sctp_chunk *abort; /* Cancel any partial delivery in progress. 
*/ asoc->stream.si->abort_pd(&asoc->ulpq, GFP_ATOMIC); if (event_type == SCTP_EVENT_T_CHUNK && subtype.chunk == SCTP_CID_ABORT) event = sctp_ulpevent_make_assoc_change(asoc, 0, SCTP_COMM_LOST, (__u16)error, 0, 0, chunk, GFP_ATOMIC); else event = sctp_ulpevent_make_assoc_change(asoc, 0, SCTP_COMM_LOST, (__u16)error, 0, 0, NULL, GFP_ATOMIC); if (event) sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP, SCTP_ULPEVENT(event)); if (asoc->overall_error_count >= asoc->max_retrans) { abort = sctp_make_violation_max_retrans(asoc, chunk); if (abort) sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(abort)); } sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE, SCTP_STATE(SCTP_STATE_CLOSED)); /* SEND_FAILED sent later when cleaning up the association. */ asoc->outqueue.error = error; sctp_add_cmd_sf(commands, SCTP_CMD_DELETE_TCB, SCTP_NULL()); } /* Process an init chunk (may be real INIT/INIT-ACK or an embedded INIT * inside the cookie. In reality, this is only used for INIT-ACK processing * since all other cases use "temporary" associations and can do all * their work in statefuns directly. */ static int sctp_cmd_process_init(struct sctp_cmd_seq *commands, struct sctp_association *asoc, struct sctp_chunk *chunk, struct sctp_init_chunk *peer_init, gfp_t gfp) { int error; /* We only process the init as a sideeffect in a single * case. This is when we process the INIT-ACK. If we * fail during INIT processing (due to malloc problems), * just return the error and stop processing the stack. */ if (!sctp_process_init(asoc, chunk, sctp_source(chunk), peer_init, gfp)) error = -ENOMEM; else error = 0; return error; } /* Helper function to break out starting up of heartbeat timers. */ static void sctp_cmd_hb_timers_start(struct sctp_cmd_seq *cmds, struct sctp_association *asoc) { struct sctp_transport *t; /* Start a heartbeat timer for each transport on the association. * hold a reference on the transport to make sure none of * the needed data structures go away. */ list_for_each_entry(t, &asoc->peer.transport_addr_list, transports) sctp_transport_reset_hb_timer(t); } static void sctp_cmd_hb_timers_stop(struct sctp_cmd_seq *cmds, struct sctp_association *asoc) { struct sctp_transport *t; /* Stop all heartbeat timers. */ list_for_each_entry(t, &asoc->peer.transport_addr_list, transports) { if (del_timer(&t->hb_timer)) sctp_transport_put(t); } } /* Helper function to stop any pending T3-RTX timers */ static void sctp_cmd_t3_rtx_timers_stop(struct sctp_cmd_seq *cmds, struct sctp_association *asoc) { struct sctp_transport *t; list_for_each_entry(t, &asoc->peer.transport_addr_list, transports) { if (del_timer(&t->T3_rtx_timer)) sctp_transport_put(t); } } /* Helper function to handle the reception of an HEARTBEAT ACK. */ static void sctp_cmd_transport_on(struct sctp_cmd_seq *cmds, struct sctp_association *asoc, struct sctp_transport *t, struct sctp_chunk *chunk) { struct sctp_sender_hb_info *hbinfo; int was_unconfirmed = 0; /* 8.3 Upon the receipt of the HEARTBEAT ACK, the sender of the * HEARTBEAT should clear the error counter of the destination * transport address to which the HEARTBEAT was sent. */ t->error_count = 0; /* * Although RFC4960 specifies that the overall error count must * be cleared when a HEARTBEAT ACK is received, we make an * exception while in SHUTDOWN PENDING. If the peer keeps its * window shut forever, we may never be able to transmit our * outstanding data and rely on the retransmission limit be reached * to shutdown the association. 
*/ if (t->asoc->state < SCTP_STATE_SHUTDOWN_PENDING) t->asoc->overall_error_count = 0; /* Clear the hb_sent flag to signal that we had a good * acknowledgement. */ t->hb_sent = 0; /* Mark the destination transport address as active if it is not so * marked. */ if ((t->state == SCTP_INACTIVE) || (t->state == SCTP_UNCONFIRMED)) { was_unconfirmed = 1; sctp_assoc_control_transport(asoc, t, SCTP_TRANSPORT_UP, SCTP_HEARTBEAT_SUCCESS); } if (t->state == SCTP_PF) sctp_assoc_control_transport(asoc, t, SCTP_TRANSPORT_UP, SCTP_HEARTBEAT_SUCCESS); /* HB-ACK was received for a the proper HB. Consider this * forward progress. */ if (t->dst) sctp_transport_dst_confirm(t); /* The receiver of the HEARTBEAT ACK should also perform an * RTT measurement for that destination transport address * using the time value carried in the HEARTBEAT ACK chunk. * If the transport's rto_pending variable has been cleared, * it was most likely due to a retransmit. However, we want * to re-enable it to properly update the rto. */ if (t->rto_pending == 0) t->rto_pending = 1; hbinfo = (struct sctp_sender_hb_info *)chunk->skb->data; sctp_transport_update_rto(t, (jiffies - hbinfo->sent_at)); /* Update the heartbeat timer. */ sctp_transport_reset_hb_timer(t); if (was_unconfirmed && asoc->peer.transport_count == 1) sctp_transport_immediate_rtx(t); } /* Helper function to process the process SACK command. */ static int sctp_cmd_process_sack(struct sctp_cmd_seq *cmds, struct sctp_association *asoc, struct sctp_chunk *chunk) { int err = 0; if (sctp_outq_sack(&asoc->outqueue, chunk)) { /* There are no more TSNs awaiting SACK. */ err = sctp_do_sm(asoc->base.net, SCTP_EVENT_T_OTHER, SCTP_ST_OTHER(SCTP_EVENT_NO_PENDING_TSN), asoc->state, asoc->ep, asoc, NULL, GFP_ATOMIC); } return err; } /* Helper function to set the timeout value for T2-SHUTDOWN timer and to set * the transport for a shutdown chunk. */ static void sctp_cmd_setup_t2(struct sctp_cmd_seq *cmds, struct sctp_association *asoc, struct sctp_chunk *chunk) { struct sctp_transport *t; if (chunk->transport) t = chunk->transport; else { t = sctp_assoc_choose_alter_transport(asoc, asoc->shutdown_last_sent_to); chunk->transport = t; } asoc->shutdown_last_sent_to = t; asoc->timeouts[SCTP_EVENT_TIMEOUT_T2_SHUTDOWN] = t->rto; } /* Helper function to change the state of an association. */ static void sctp_cmd_new_state(struct sctp_cmd_seq *cmds, struct sctp_association *asoc, enum sctp_state state) { struct sock *sk = asoc->base.sk; asoc->state = state; pr_debug("%s: asoc:%p[%s]\n", __func__, asoc, sctp_state_tbl[state]); if (sctp_style(sk, TCP)) { /* Change the sk->sk_state of a TCP-style socket that has * successfully completed a connect() call. */ if (sctp_state(asoc, ESTABLISHED) && sctp_sstate(sk, CLOSED)) inet_sk_set_state(sk, SCTP_SS_ESTABLISHED); /* Set the RCV_SHUTDOWN flag when a SHUTDOWN is received. */ if (sctp_state(asoc, SHUTDOWN_RECEIVED) && sctp_sstate(sk, ESTABLISHED)) { inet_sk_set_state(sk, SCTP_SS_CLOSING); sk->sk_shutdown |= RCV_SHUTDOWN; } } if (sctp_state(asoc, COOKIE_WAIT)) { /* Reset init timeouts since they may have been * increased due to timer expirations. 
*/ asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT] = asoc->rto_initial; asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE] = asoc->rto_initial; } if (sctp_state(asoc, ESTABLISHED)) { kfree(asoc->peer.cookie); asoc->peer.cookie = NULL; } if (sctp_state(asoc, ESTABLISHED) || sctp_state(asoc, CLOSED) || sctp_state(asoc, SHUTDOWN_RECEIVED)) { /* Wake up any processes waiting in the asoc's wait queue in * sctp_wait_for_connect() or sctp_wait_for_sndbuf(). */ if (waitqueue_active(&asoc->wait)) wake_up_interruptible(&asoc->wait); /* Wake up any processes waiting in the sk's sleep queue of * a TCP-style or UDP-style peeled-off socket in * sctp_wait_for_accept() or sctp_wait_for_packet(). * For a UDP-style socket, the waiters are woken up by the * notifications. */ if (!sctp_style(sk, UDP)) sk->sk_state_change(sk); } if (sctp_state(asoc, SHUTDOWN_PENDING) && !sctp_outq_is_empty(&asoc->outqueue)) sctp_outq_uncork(&asoc->outqueue, GFP_ATOMIC); } /* Helper function to delete an association. */ static void sctp_cmd_delete_tcb(struct sctp_cmd_seq *cmds, struct sctp_association *asoc) { struct sock *sk = asoc->base.sk; /* If it is a non-temporary association belonging to a TCP-style * listening socket that is not closed, do not free it so that accept() * can pick it up later. */ if (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING) && (!asoc->temp) && (sk->sk_shutdown != SHUTDOWN_MASK)) return; sctp_association_free(asoc); } /* * ADDIP Section 4.1 ASCONF Chunk Procedures * A4) Start a T-4 RTO timer, using the RTO value of the selected * destination address (we use active path instead of primary path just * because primary path may be inactive. */ static void sctp_cmd_setup_t4(struct sctp_cmd_seq *cmds, struct sctp_association *asoc, struct sctp_chunk *chunk) { struct sctp_transport *t; t = sctp_assoc_choose_alter_transport(asoc, chunk->transport); asoc->timeouts[SCTP_EVENT_TIMEOUT_T4_RTO] = t->rto; chunk->transport = t; } /* Process an incoming Operation Error Chunk. */ static void sctp_cmd_process_operr(struct sctp_cmd_seq *cmds, struct sctp_association *asoc, struct sctp_chunk *chunk) { struct sctp_errhdr *err_hdr; struct sctp_ulpevent *ev; while (chunk->chunk_end > chunk->skb->data) { err_hdr = (struct sctp_errhdr *)(chunk->skb->data); ev = sctp_ulpevent_make_remote_error(asoc, chunk, 0, GFP_ATOMIC); if (!ev) return; asoc->stream.si->enqueue_event(&asoc->ulpq, ev); switch (err_hdr->cause) { case SCTP_ERROR_UNKNOWN_CHUNK: { struct sctp_chunkhdr *unk_chunk_hdr; unk_chunk_hdr = (struct sctp_chunkhdr *) err_hdr->variable; switch (unk_chunk_hdr->type) { /* ADDIP 4.1 A9) If the peer responds to an ASCONF with * an ERROR chunk reporting that it did not recognized * the ASCONF chunk type, the sender of the ASCONF MUST * NOT send any further ASCONF chunks and MUST stop its * T-4 timer. */ case SCTP_CID_ASCONF: if (asoc->peer.asconf_capable == 0) break; asoc->peer.asconf_capable = 0; sctp_add_cmd_sf(cmds, SCTP_CMD_TIMER_STOP, SCTP_TO(SCTP_EVENT_TIMEOUT_T4_RTO)); break; default: break; } break; } default: break; } } } /* Helper function to remove the association non-primary peer * transports. */ static void sctp_cmd_del_non_primary(struct sctp_association *asoc) { struct sctp_transport *t; struct list_head *temp; struct list_head *pos; list_for_each_safe(pos, temp, &asoc->peer.transport_addr_list) { t = list_entry(pos, struct sctp_transport, transports); if (!sctp_cmp_addr_exact(&t->ipaddr, &asoc->peer.primary_addr)) { sctp_assoc_rm_peer(asoc, t); } } } /* Helper function to set sk_err on a 1-1 style socket. 
*/ static void sctp_cmd_set_sk_err(struct sctp_association *asoc, int error) { struct sock *sk = asoc->base.sk; if (!sctp_style(sk, UDP)) sk->sk_err = error; } /* Helper function to generate an association change event */ static void sctp_cmd_assoc_change(struct sctp_cmd_seq *commands, struct sctp_association *asoc, u8 state) { struct sctp_ulpevent *ev; ev = sctp_ulpevent_make_assoc_change(asoc, 0, state, 0, asoc->c.sinit_num_ostreams, asoc->c.sinit_max_instreams, NULL, GFP_ATOMIC); if (ev) asoc->stream.si->enqueue_event(&asoc->ulpq, ev); } static void sctp_cmd_peer_no_auth(struct sctp_cmd_seq *commands, struct sctp_association *asoc) { struct sctp_ulpevent *ev; ev = sctp_ulpevent_make_authkey(asoc, 0, SCTP_AUTH_NO_AUTH, GFP_ATOMIC); if (ev) asoc->stream.si->enqueue_event(&asoc->ulpq, ev); } /* Helper function to generate an adaptation indication event */ static void sctp_cmd_adaptation_ind(struct sctp_cmd_seq *commands, struct sctp_association *asoc) { struct sctp_ulpevent *ev; ev = sctp_ulpevent_make_adaptation_indication(asoc, GFP_ATOMIC); if (ev) asoc->stream.si->enqueue_event(&asoc->ulpq, ev); } static void sctp_cmd_t1_timer_update(struct sctp_association *asoc, enum sctp_event_timeout timer, char *name) { struct sctp_transport *t; t = asoc->init_last_sent_to; asoc->init_err_counter++; if (t->init_sent_count > (asoc->init_cycle + 1)) { asoc->timeouts[timer] *= 2; if (asoc->timeouts[timer] > asoc->max_init_timeo) { asoc->timeouts[timer] = asoc->max_init_timeo; } asoc->init_cycle++; pr_debug("%s: T1[%s] timeout adjustment init_err_counter:%d" " cycle:%d timeout:%ld\n", __func__, name, asoc->init_err_counter, asoc->init_cycle, asoc->timeouts[timer]); } } /* Send the whole message, chunk by chunk, to the outqueue. * This way the whole message is queued up and bundling if * encouraged for small fragments. */ static void sctp_cmd_send_msg(struct sctp_association *asoc, struct sctp_datamsg *msg, gfp_t gfp) { struct sctp_chunk *chunk; list_for_each_entry(chunk, &msg->chunks, frag_list) sctp_outq_tail(&asoc->outqueue, chunk, gfp); asoc->outqueue.sched->enqueue(&asoc->outqueue, msg); } /* These three macros allow us to pull the debugging code out of the * main flow of sctp_do_sm() to keep attention focused on the real * functionality there. */ #define debug_pre_sfn() \ pr_debug("%s[pre-fn]: ep:%p, %s, %s, asoc:%p[%s], %s\n", __func__, \ ep, sctp_evttype_tbl[event_type], (*debug_fn)(subtype), \ asoc, sctp_state_tbl[state], state_fn->name) #define debug_post_sfn() \ pr_debug("%s[post-fn]: asoc:%p, status:%s\n", __func__, asoc, \ sctp_status_tbl[status]) #define debug_post_sfx() \ pr_debug("%s[post-sfx]: error:%d, asoc:%p[%s]\n", __func__, error, \ asoc, sctp_state_tbl[(asoc && sctp_id2assoc(ep->base.sk, \ sctp_assoc2id(asoc))) ? asoc->state : SCTP_STATE_CLOSED]) /* * This is the master state machine processing function. * * If you want to understand all of lksctp, this is a * good place to start. */ int sctp_do_sm(struct net *net, enum sctp_event_type event_type, union sctp_subtype subtype, enum sctp_state state, struct sctp_endpoint *ep, struct sctp_association *asoc, void *event_arg, gfp_t gfp) { typedef const char *(printfn_t)(union sctp_subtype); static printfn_t *table[] = { NULL, sctp_cname, sctp_tname, sctp_oname, sctp_pname, }; printfn_t *debug_fn __attribute__ ((unused)) = table[event_type]; const struct sctp_sm_table_entry *state_fn; struct sctp_cmd_seq commands; enum sctp_disposition status; int error = 0; /* Look up the state function, run it, and then process the * side effects. 
These three steps are the heart of lksctp. */ state_fn = sctp_sm_lookup_event(net, event_type, state, subtype); sctp_init_cmd_seq(&commands); debug_pre_sfn(); status = state_fn->fn(net, ep, asoc, subtype, event_arg, &commands); debug_post_sfn(); error = sctp_side_effects(event_type, subtype, state, ep, &asoc, event_arg, status, &commands, gfp); debug_post_sfx(); return error; } /***************************************************************** * This the master state function side effect processing function. *****************************************************************/ static int sctp_side_effects(enum sctp_event_type event_type, union sctp_subtype subtype, enum sctp_state state, struct sctp_endpoint *ep, struct sctp_association **asoc, void *event_arg, enum sctp_disposition status, struct sctp_cmd_seq *commands, gfp_t gfp) { int error; /* FIXME - Most of the dispositions left today would be categorized * as "exceptional" dispositions. For those dispositions, it * may not be proper to run through any of the commands at all. * For example, the command interpreter might be run only with * disposition SCTP_DISPOSITION_CONSUME. */ if (0 != (error = sctp_cmd_interpreter(event_type, subtype, state, ep, *asoc, event_arg, status, commands, gfp))) goto bail; switch (status) { case SCTP_DISPOSITION_DISCARD: pr_debug("%s: ignored sctp protocol event - state:%d, " "event_type:%d, event_id:%d\n", __func__, state, event_type, subtype.chunk); break; case SCTP_DISPOSITION_NOMEM: /* We ran out of memory, so we need to discard this * packet. */ /* BUG--we should now recover some memory, probably by * reneging... */ error = -ENOMEM; break; case SCTP_DISPOSITION_DELETE_TCB: case SCTP_DISPOSITION_ABORT: /* This should now be a command. */ *asoc = NULL; break; case SCTP_DISPOSITION_CONSUME: /* * We should no longer have much work to do here as the * real work has been done as explicit commands above. */ break; case SCTP_DISPOSITION_VIOLATION: net_err_ratelimited("protocol violation state %d chunkid %d\n", state, subtype.chunk); break; case SCTP_DISPOSITION_NOT_IMPL: pr_warn("unimplemented feature in state %d, event_type %d, event_id %d\n", state, event_type, subtype.chunk); break; case SCTP_DISPOSITION_BUG: pr_err("bug in state %d, event_type %d, event_id %d\n", state, event_type, subtype.chunk); BUG(); break; default: pr_err("impossible disposition %d in state %d, event_type %d, event_id %d\n", status, state, event_type, subtype.chunk); error = status; if (error >= 0) error = -EINVAL; WARN_ON_ONCE(1); break; } bail: return error; } /******************************************************************** * 2nd Level Abstractions ********************************************************************/ /* This is the side-effect interpreter. */ static int sctp_cmd_interpreter(enum sctp_event_type event_type, union sctp_subtype subtype, enum sctp_state state, struct sctp_endpoint *ep, struct sctp_association *asoc, void *event_arg, enum sctp_disposition status, struct sctp_cmd_seq *commands, gfp_t gfp) { struct sctp_sock *sp = sctp_sk(ep->base.sk); struct sctp_chunk *chunk = NULL, *new_obj; struct sctp_packet *packet; struct sctp_sackhdr sackh; struct timer_list *timer; struct sctp_transport *t; unsigned long timeout; struct sctp_cmd *cmd; int local_cork = 0; int error = 0; int force; if (SCTP_EVENT_T_TIMEOUT != event_type) chunk = event_arg; /* Note: This whole file is a huge candidate for rework. 
* For example, each command could either have its own handler, so * the loop would look like: * while (cmds) * cmd->handle(x, y, z) * --jgrimm */ while (NULL != (cmd = sctp_next_cmd(commands))) { switch (cmd->verb) { case SCTP_CMD_NOP: /* Do nothing. */ break; case SCTP_CMD_NEW_ASOC: /* Register a new association. */ if (local_cork) { sctp_outq_uncork(&asoc->outqueue, gfp); local_cork = 0; } /* Register with the endpoint. */ asoc = cmd->obj.asoc; BUG_ON(asoc->peer.primary_path == NULL); sctp_endpoint_add_asoc(ep, asoc); break; case SCTP_CMD_PURGE_OUTQUEUE: sctp_outq_teardown(&asoc->outqueue); break; case SCTP_CMD_DELETE_TCB: if (local_cork) { sctp_outq_uncork(&asoc->outqueue, gfp); local_cork = 0; } /* Delete the current association. */ sctp_cmd_delete_tcb(commands, asoc); asoc = NULL; break; case SCTP_CMD_NEW_STATE: /* Enter a new state. */ sctp_cmd_new_state(commands, asoc, cmd->obj.state); break; case SCTP_CMD_REPORT_TSN: /* Record the arrival of a TSN. */ error = sctp_tsnmap_mark(&asoc->peer.tsn_map, cmd->obj.u32, NULL); break; case SCTP_CMD_REPORT_FWDTSN: asoc->stream.si->report_ftsn(&asoc->ulpq, cmd->obj.u32); break; case SCTP_CMD_PROCESS_FWDTSN: asoc->stream.si->handle_ftsn(&asoc->ulpq, cmd->obj.chunk); break; case SCTP_CMD_GEN_SACK: /* Generate a Selective ACK. * The argument tells us whether to just count * the packet and MAYBE generate a SACK, or * force a SACK out. */ force = cmd->obj.i32; error = sctp_gen_sack(asoc, force, commands); break; case SCTP_CMD_PROCESS_SACK: /* Process an inbound SACK. */ error = sctp_cmd_process_sack(commands, asoc, cmd->obj.chunk); break; case SCTP_CMD_GEN_INIT_ACK: /* Generate an INIT ACK chunk. */ new_obj = sctp_make_init_ack(asoc, chunk, GFP_ATOMIC, 0); if (!new_obj) { error = -ENOMEM; break; } sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(new_obj)); break; case SCTP_CMD_PEER_INIT: /* Process a unified INIT from the peer. * Note: Only used during INIT-ACK processing. If * there is an error just return to the outter * layer which will bail. */ error = sctp_cmd_process_init(commands, asoc, chunk, cmd->obj.init, gfp); break; case SCTP_CMD_GEN_COOKIE_ECHO: /* Generate a COOKIE ECHO chunk. */ new_obj = sctp_make_cookie_echo(asoc, chunk); if (!new_obj) { if (cmd->obj.chunk) sctp_chunk_free(cmd->obj.chunk); error = -ENOMEM; break; } sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(new_obj)); /* If there is an ERROR chunk to be sent along with * the COOKIE_ECHO, send it, too. */ if (cmd->obj.chunk) sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(cmd->obj.chunk)); if (new_obj->transport) { new_obj->transport->init_sent_count++; asoc->init_last_sent_to = new_obj->transport; } /* FIXME - Eventually come up with a cleaner way to * enabling COOKIE-ECHO + DATA bundling during * multihoming stale cookie scenarios, the following * command plays with asoc->peer.retran_path to * avoid the problem of sending the COOKIE-ECHO and * DATA in different paths, which could result * in the association being ABORTed if the DATA chunk * is processed first by the server. Checking the * init error counter simply causes this command * to be executed only during failed attempts of * association establishment. */ if ((asoc->peer.retran_path != asoc->peer.primary_path) && (asoc->init_err_counter > 0)) { sctp_add_cmd_sf(commands, SCTP_CMD_FORCE_PRIM_RETRAN, SCTP_NULL()); } break; case SCTP_CMD_GEN_SHUTDOWN: /* Generate SHUTDOWN when in SHUTDOWN_SENT state. * Reset error counts. */ asoc->overall_error_count = 0; /* Generate a SHUTDOWN chunk. 
*/ new_obj = sctp_make_shutdown(asoc, chunk); if (!new_obj) { error = -ENOMEM; break; } sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(new_obj)); break; case SCTP_CMD_CHUNK_ULP: /* Send a chunk to the sockets layer. */ pr_debug("%s: sm_sideff: chunk_up:%p, ulpq:%p\n", __func__, cmd->obj.chunk, &asoc->ulpq); asoc->stream.si->ulpevent_data(&asoc->ulpq, cmd->obj.chunk, GFP_ATOMIC); break; case SCTP_CMD_EVENT_ULP: /* Send a notification to the sockets layer. */ pr_debug("%s: sm_sideff: event_up:%p, ulpq:%p\n", __func__, cmd->obj.ulpevent, &asoc->ulpq); asoc->stream.si->enqueue_event(&asoc->ulpq, cmd->obj.ulpevent); break; case SCTP_CMD_REPLY: /* If an caller has not already corked, do cork. */ if (!asoc->outqueue.cork) { sctp_outq_cork(&asoc->outqueue); local_cork = 1; } /* Send a chunk to our peer. */ sctp_outq_tail(&asoc->outqueue, cmd->obj.chunk, gfp); break; case SCTP_CMD_SEND_PKT: /* Send a full packet to our peer. */ packet = cmd->obj.packet; sctp_packet_transmit(packet, gfp); sctp_ootb_pkt_free(packet); break; case SCTP_CMD_T1_RETRAN: /* Mark a transport for retransmission. */ sctp_retransmit(&asoc->outqueue, cmd->obj.transport, SCTP_RTXR_T1_RTX); break; case SCTP_CMD_RETRAN: /* Mark a transport for retransmission. */ sctp_retransmit(&asoc->outqueue, cmd->obj.transport, SCTP_RTXR_T3_RTX); break; case SCTP_CMD_ECN_CE: /* Do delayed CE processing. */ sctp_do_ecn_ce_work(asoc, cmd->obj.u32); break; case SCTP_CMD_ECN_ECNE: /* Do delayed ECNE processing. */ new_obj = sctp_do_ecn_ecne_work(asoc, cmd->obj.u32, chunk); if (new_obj) sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(new_obj)); break; case SCTP_CMD_ECN_CWR: /* Do delayed CWR processing. */ sctp_do_ecn_cwr_work(asoc, cmd->obj.u32); break; case SCTP_CMD_SETUP_T2: sctp_cmd_setup_t2(commands, asoc, cmd->obj.chunk); break; case SCTP_CMD_TIMER_START_ONCE: timer = &asoc->timers[cmd->obj.to]; if (timer_pending(timer)) break; fallthrough; case SCTP_CMD_TIMER_START: timer = &asoc->timers[cmd->obj.to]; timeout = asoc->timeouts[cmd->obj.to]; BUG_ON(!timeout); /* * SCTP has a hard time with timer starts. Because we process * timer starts as side effects, it can be hard to tell if we * have already started a timer or not, which leads to BUG * halts when we call add_timer. So here, instead of just starting * a timer, if the timer is already started, and just mod * the timer with the shorter of the two expiration times */ if (!timer_pending(timer)) sctp_association_hold(asoc); timer_reduce(timer, jiffies + timeout); break; case SCTP_CMD_TIMER_RESTART: timer = &asoc->timers[cmd->obj.to]; timeout = asoc->timeouts[cmd->obj.to]; if (!mod_timer(timer, jiffies + timeout)) sctp_association_hold(asoc); break; case SCTP_CMD_TIMER_STOP: timer = &asoc->timers[cmd->obj.to]; if (del_timer(timer)) sctp_association_put(asoc); break; case SCTP_CMD_INIT_CHOOSE_TRANSPORT: chunk = cmd->obj.chunk; t = sctp_assoc_choose_alter_transport(asoc, asoc->init_last_sent_to); asoc->init_last_sent_to = t; chunk->transport = t; t->init_sent_count++; /* Set the new transport as primary */ sctp_assoc_set_primary(asoc, t); break; case SCTP_CMD_INIT_RESTART: /* Do the needed accounting and updates * associated with restarting an initialization * timer. Only multiply the timeout by two if * all transports have been tried at the current * timeout. 
*/ sctp_cmd_t1_timer_update(asoc, SCTP_EVENT_TIMEOUT_T1_INIT, "INIT"); sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART, SCTP_TO(SCTP_EVENT_TIMEOUT_T1_INIT)); break; case SCTP_CMD_COOKIEECHO_RESTART: /* Do the needed accounting and updates * associated with restarting an initialization * timer. Only multiply the timeout by two if * all transports have been tried at the current * timeout. */ sctp_cmd_t1_timer_update(asoc, SCTP_EVENT_TIMEOUT_T1_COOKIE, "COOKIE"); /* If we've sent any data bundled with * COOKIE-ECHO we need to resend. */ list_for_each_entry(t, &asoc->peer.transport_addr_list, transports) { sctp_retransmit_mark(&asoc->outqueue, t, SCTP_RTXR_T1_RTX); } sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART, SCTP_TO(SCTP_EVENT_TIMEOUT_T1_COOKIE)); break; case SCTP_CMD_INIT_FAILED: sctp_cmd_init_failed(commands, asoc, cmd->obj.u16); break; case SCTP_CMD_ASSOC_FAILED: sctp_cmd_assoc_failed(commands, asoc, event_type, subtype, chunk, cmd->obj.u16); break; case SCTP_CMD_INIT_COUNTER_INC: asoc->init_err_counter++; break; case SCTP_CMD_INIT_COUNTER_RESET: asoc->init_err_counter = 0; asoc->init_cycle = 0; list_for_each_entry(t, &asoc->peer.transport_addr_list, transports) { t->init_sent_count = 0; } break; case SCTP_CMD_REPORT_DUP: sctp_tsnmap_mark_dup(&asoc->peer.tsn_map, cmd->obj.u32); break; case SCTP_CMD_REPORT_BAD_TAG: pr_debug("%s: vtag mismatch!\n", __func__); break; case SCTP_CMD_STRIKE: /* Mark one strike against a transport. */ sctp_do_8_2_transport_strike(commands, asoc, cmd->obj.transport, 0); break; case SCTP_CMD_TRANSPORT_IDLE: t = cmd->obj.transport; sctp_transport_lower_cwnd(t, SCTP_LOWER_CWND_INACTIVE); break; case SCTP_CMD_TRANSPORT_HB_SENT: t = cmd->obj.transport; sctp_do_8_2_transport_strike(commands, asoc, t, 1); t->hb_sent = 1; break; case SCTP_CMD_TRANSPORT_ON: t = cmd->obj.transport; sctp_cmd_transport_on(commands, asoc, t, chunk); break; case SCTP_CMD_HB_TIMERS_START: sctp_cmd_hb_timers_start(commands, asoc); break; case SCTP_CMD_HB_TIMER_UPDATE: t = cmd->obj.transport; sctp_transport_reset_hb_timer(t); break; case SCTP_CMD_HB_TIMERS_STOP: sctp_cmd_hb_timers_stop(commands, asoc); break; case SCTP_CMD_PROBE_TIMER_UPDATE: t = cmd->obj.transport; sctp_transport_reset_probe_timer(t); break; case SCTP_CMD_REPORT_ERROR: error = cmd->obj.error; break; case SCTP_CMD_PROCESS_CTSN: /* Dummy up a SACK for processing. */ sackh.cum_tsn_ack = cmd->obj.be32; sackh.a_rwnd = htonl(asoc->peer.rwnd + asoc->outqueue.outstanding_bytes); sackh.num_gap_ack_blocks = 0; sackh.num_dup_tsns = 0; chunk->subh.sack_hdr = &sackh; sctp_add_cmd_sf(commands, SCTP_CMD_PROCESS_SACK, SCTP_CHUNK(chunk)); break; case SCTP_CMD_DISCARD_PACKET: /* We need to discard the whole packet. 
* Uncork the queue since there might be * responses pending */ chunk->pdiscard = 1; if (asoc) { sctp_outq_uncork(&asoc->outqueue, gfp); local_cork = 0; } break; case SCTP_CMD_RTO_PENDING: t = cmd->obj.transport; t->rto_pending = 1; break; case SCTP_CMD_PART_DELIVER: asoc->stream.si->start_pd(&asoc->ulpq, GFP_ATOMIC); break; case SCTP_CMD_RENEGE: asoc->stream.si->renege_events(&asoc->ulpq, cmd->obj.chunk, GFP_ATOMIC); break; case SCTP_CMD_SETUP_T4: sctp_cmd_setup_t4(commands, asoc, cmd->obj.chunk); break; case SCTP_CMD_PROCESS_OPERR: sctp_cmd_process_operr(commands, asoc, chunk); break; case SCTP_CMD_CLEAR_INIT_TAG: asoc->peer.i.init_tag = 0; break; case SCTP_CMD_DEL_NON_PRIMARY: sctp_cmd_del_non_primary(asoc); break; case SCTP_CMD_T3_RTX_TIMERS_STOP: sctp_cmd_t3_rtx_timers_stop(commands, asoc); break; case SCTP_CMD_FORCE_PRIM_RETRAN: t = asoc->peer.retran_path; asoc->peer.retran_path = asoc->peer.primary_path; sctp_outq_uncork(&asoc->outqueue, gfp); local_cork = 0; asoc->peer.retran_path = t; break; case SCTP_CMD_SET_SK_ERR: sctp_cmd_set_sk_err(asoc, cmd->obj.error); break; case SCTP_CMD_ASSOC_CHANGE: sctp_cmd_assoc_change(commands, asoc, cmd->obj.u8); break; case SCTP_CMD_ADAPTATION_IND: sctp_cmd_adaptation_ind(commands, asoc); break; case SCTP_CMD_PEER_NO_AUTH: sctp_cmd_peer_no_auth(commands, asoc); break; case SCTP_CMD_ASSOC_SHKEY: error = sctp_auth_asoc_init_active_key(asoc, GFP_ATOMIC); break; case SCTP_CMD_UPDATE_INITTAG: asoc->peer.i.init_tag = cmd->obj.u32; break; case SCTP_CMD_SEND_MSG: if (!asoc->outqueue.cork) { sctp_outq_cork(&asoc->outqueue); local_cork = 1; } sctp_cmd_send_msg(asoc, cmd->obj.msg, gfp); break; case SCTP_CMD_PURGE_ASCONF_QUEUE: sctp_asconf_queue_teardown(asoc); break; case SCTP_CMD_SET_ASOC: if (asoc && local_cork) { sctp_outq_uncork(&asoc->outqueue, gfp); local_cork = 0; } asoc = cmd->obj.asoc; break; default: pr_warn("Impossible command: %u\n", cmd->verb); break; } if (error) { cmd = sctp_next_cmd(commands); while (cmd) { if (cmd->verb == SCTP_CMD_REPLY) sctp_chunk_free(cmd->obj.chunk); cmd = sctp_next_cmd(commands); } break; } } /* If this is in response to a received chunk, wait until * we are done with the packet to open the queue so that we don't * send multiple packets in response to a single request. */ if (asoc && SCTP_EVENT_T_CHUNK == event_type && chunk) { if (chunk->end_of_packet || chunk->singleton) sctp_outq_uncork(&asoc->outqueue, gfp); } else if (local_cork) sctp_outq_uncork(&asoc->outqueue, gfp); if (sp->data_ready_signalled) sp->data_ready_signalled = 0; return error; }
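/*
 * Illustrative sketch, not part of the SCTP code above.  The state functions
 * in that file never perform side effects directly: they queue verbs into a
 * struct sctp_cmd_seq, and sctp_cmd_interpreter() drains the queue afterwards.
 * The minimal userspace program below shows the same command-queue/interpreter
 * pattern in isolation; every demo_* name is invented for the example and
 * nothing here is taken from the kernel sources.
 */
#include <stdio.h>

enum demo_verb { DEMO_CMD_NOP, DEMO_CMD_REPLY, DEMO_CMD_TIMER_STOP };

struct demo_cmd {
	enum demo_verb verb;
	int arg;
};

struct demo_cmd_seq {
	struct demo_cmd cmds[8];
	int next_free;			/* next slot a producer writes */
	int next_cmd;			/* next slot the interpreter reads */
};

/* Queue a command; plays the role sctp_add_cmd_sf() plays above. */
static void demo_add_cmd(struct demo_cmd_seq *seq, enum demo_verb verb, int arg)
{
	if (seq->next_free < 8) {
		seq->cmds[seq->next_free].verb = verb;
		seq->cmds[seq->next_free].arg = arg;
		seq->next_free++;
	}
}

/* Pop the next queued command; plays the role of sctp_next_cmd(). */
static struct demo_cmd *demo_next_cmd(struct demo_cmd_seq *seq)
{
	if (seq->next_cmd >= seq->next_free)
		return NULL;
	return &seq->cmds[seq->next_cmd++];
}

/* One switch per verb: the shape of sctp_cmd_interpreter() in miniature. */
static int demo_interpreter(struct demo_cmd_seq *seq)
{
	struct demo_cmd *cmd;

	while ((cmd = demo_next_cmd(seq)) != NULL) {
		switch (cmd->verb) {
		case DEMO_CMD_NOP:
			break;
		case DEMO_CMD_REPLY:
			printf("side effect: send reply chunk %d\n", cmd->arg);
			break;
		case DEMO_CMD_TIMER_STOP:
			printf("side effect: stop timer %d\n", cmd->arg);
			break;
		}
	}
	return 0;
}

int main(void)
{
	struct demo_cmd_seq seq = { .next_free = 0, .next_cmd = 0 };

	/* A "state function" only decides what should happen... */
	demo_add_cmd(&seq, DEMO_CMD_REPLY, 1);
	demo_add_cmd(&seq, DEMO_CMD_TIMER_STOP, 3);

	/* ...and all side effects run later, in one place. */
	return demo_interpreter(&seq);
}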
/*
 * kmod - the kernel module loader
 */
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/sched/task.h>
#include <linux/binfmts.h>
#include <linux/syscalls.h>
#include <linux/unistd.h>
#include <linux/kmod.h>
#include <linux/slab.h>
#include <linux/completion.h>
#include <linux/cred.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/workqueue.h>
#include <linux/security.h>
#include <linux/mount.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/resource.h>
#include <linux/notifier.h>
#include <linux/suspend.h>
#include <linux/rwsem.h>
#include <linux/ptrace.h>
#include <linux/async.h>
#include <linux/uaccess.h>

#include <trace/events/module.h>

/*
 * Assuming:
 *
 *	threads = div64_u64((u64) totalram_pages * (u64) PAGE_SIZE,
 *			    (u64) THREAD_SIZE * 8UL);
 *
 * needing fewer than 50 threads would mean we're dealing with systems
 * smaller than 3200 pages. This assumes you are capable of having ~13M memory,
 * and this would only be an upper limit, after which the OOM killer would take
 * effect. Systems like these are very unlikely if modules are enabled.
 */
#define MAX_KMOD_CONCURRENT 50
static atomic_t kmod_concurrent_max = ATOMIC_INIT(MAX_KMOD_CONCURRENT);
static DECLARE_WAIT_QUEUE_HEAD(kmod_wq);

/*
 * This is a restriction on having *all* MAX_KMOD_CONCURRENT threads
 * running at the same time without returning. When this happens we
 * believe you've somehow ended up with a recursive module dependency
 * creating a loop.
 *
 * We have no option but to fail.
 *
 * Userspace should proactively try to detect and prevent these.
 */
#define MAX_KMOD_ALL_BUSY_TIMEOUT 5

/* modprobe_path is set via /proc/sys. */
char modprobe_path[KMOD_PATH_LEN] = CONFIG_MODPROBE_PATH;

static void free_modprobe_argv(struct subprocess_info *info)
{
	kfree(info->argv[3]); /* check call_modprobe() */
	kfree(info->argv);
}

static int call_modprobe(char *module_name, int wait)
{
	struct subprocess_info *info;
	static char *envp[] = {
		"HOME=/",
		"TERM=linux",
		"PATH=/sbin:/usr/sbin:/bin:/usr/bin",
		NULL
	};
	char **argv = kmalloc(sizeof(char *[5]), GFP_KERNEL);

	if (!argv)
		goto out;

	module_name = kstrdup(module_name, GFP_KERNEL);
	if (!module_name)
		goto free_argv;

	argv[0] = modprobe_path;
	argv[1] = "-q";
	argv[2] = "--";
	argv[3] = module_name;	/* check free_modprobe_argv() */
	argv[4] = NULL;

	info = call_usermodehelper_setup(modprobe_path, argv, envp, GFP_KERNEL,
					 NULL, free_modprobe_argv, NULL);
	if (!info)
		goto free_module_name;

	return call_usermodehelper_exec(info, wait | UMH_KILLABLE);

free_module_name:
	kfree(module_name);
free_argv:
	kfree(argv);
out:
	return -ENOMEM;
}

/**
 * __request_module - try to load a kernel module
 * @wait: wait (or not) for the operation to complete
 * @fmt: printf style format string for the name of the module
 * @...: arguments as specified in the format string
 *
 * Load a module using the user mode module loader. The function returns
 * zero on success or a negative errno code or positive exit code from
 * "modprobe" on failure. Note that a successful module load does not mean
 * the module did not then unload and exit on an error of its own. Callers
 * must check that the service they requested is now available, not blindly
 * invoke it.
 *
 * If module auto-loading support is disabled then this function
 * simply returns -ENOENT.
 */
int __request_module(bool wait, const char *fmt, ...)
{
	va_list args;
	char module_name[MODULE_NAME_LEN];
	int ret;

	/*
	 * We don't allow synchronous module loading from async. Module
	 * init may invoke async_synchronize_full() which will end up
	 * waiting for this task which already is waiting for the module
	 * loading to complete, leading to a deadlock.
	 */
	WARN_ON_ONCE(wait && current_is_async());

	if (!modprobe_path[0])
		return -ENOENT;

	va_start(args, fmt);
	ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args);
	va_end(args);
	if (ret >= MODULE_NAME_LEN)
		return -ENAMETOOLONG;

	ret = security_kernel_module_request(module_name);
	if (ret)
		return ret;

	if (atomic_dec_if_positive(&kmod_concurrent_max) < 0) {
		pr_warn_ratelimited("request_module: kmod_concurrent_max (%u) close to 0 (max_modprobes: %u), for module %s, throttling...",
				    atomic_read(&kmod_concurrent_max),
				    MAX_KMOD_CONCURRENT, module_name);
		ret = wait_event_killable_timeout(kmod_wq,
						  atomic_dec_if_positive(&kmod_concurrent_max) >= 0,
						  MAX_KMOD_ALL_BUSY_TIMEOUT * HZ);
		if (!ret) {
			pr_warn_ratelimited("request_module: modprobe %s cannot be processed, kmod busy with %d threads for more than %d seconds now",
					    module_name, MAX_KMOD_CONCURRENT,
					    MAX_KMOD_ALL_BUSY_TIMEOUT);
			return -ETIME;
		} else if (ret == -ERESTARTSYS) {
			pr_warn_ratelimited("request_module: sigkill sent for modprobe %s, giving up",
					    module_name);
			return ret;
		}
	}

	trace_module_request(module_name, wait, _RET_IP_);

	ret = call_modprobe(module_name, wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC);

	atomic_inc(&kmod_concurrent_max);
	wake_up(&kmod_wq);

	return ret;
}
EXPORT_SYMBOL(__request_module);
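/*
 * Illustrative sketch, not part of the kmod code above.  It shows the calling
 * convention documented in the kernel-doc for __request_module(): the
 * request_module() wrapper from <linux/kmod.h> only reports that modprobe
 * ran, so a caller re-checks for the facility it wanted rather than trusting
 * the return value.  The module name "crc32c" is just an arbitrary example,
 * and the demo itself is a hypothetical out-of-tree module, not code from
 * this file.
 */
#include <linux/module.h>
#include <linux/init.h>
#include <linux/kmod.h>
#include <linux/printk.h>

static int __init kmod_demo_init(void)
{
	int ret;

	/*
	 * request_module("crc32c") expands to __request_module(true, ...),
	 * so it waits for modprobe to finish.  A zero return does not
	 * guarantee the module stayed loaded or registered anything.
	 */
	ret = request_module("crc32c");
	pr_info("kmod_demo: request_module(\"crc32c\") returned %d\n", ret);

	/*
	 * A real caller would now look the service up again (e.g. in its
	 * own registry of algorithms or protocols) before using it, per
	 * the "Callers must check" note above.
	 */
	return 0;
}

static void __exit kmod_demo_exit(void)
{
}

module_init(kmod_demo_init);
module_exit(kmod_demo_exit);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Usage sketch for __request_module()");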
// SPDX-License-Identifier: GPL-2.0 /* * linux/mm/swap_state.c * * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds * Swap reorganised 29.12.95, Stephen Tweedie * * Rewritten to use page cache, (C) 1998 Stephen Tweedie */ #include <linux/mm.h> #include <linux/gfp.h> #include <linux/kernel_stat.h> #include <linux/swap.h> #include <linux/swapops.h> #include <linux/init.h> #include <linux/pagemap.h> #include <linux/backing-dev.h> #include <linux/blkdev.h> #include <linux/pagevec.h> #include <linux/migrate.h> #include <linux/vmalloc.h> #include <linux/swap_slots.h> #include <linux/huge_mm.h> #include <linux/shmem_fs.h> #include "internal.h" /* * swapper_space is a fiction, retained to simplify the path through * vmscan's shrink_page_list. */ static const struct address_space_operations swap_aops = { .writepage = swap_writepage, .set_page_dirty = swap_set_page_dirty, #ifdef CONFIG_MIGRATION .migratepage = migrate_page, #endif }; struct address_space *swapper_spaces[MAX_SWAPFILES] __read_mostly; static unsigned int nr_swapper_spaces[MAX_SWAPFILES] __read_mostly; static bool enable_vma_readahead __read_mostly = true; #define SWAP_RA_WIN_SHIFT (PAGE_SHIFT / 2) #define SWAP_RA_HITS_MASK ((1UL << SWAP_RA_WIN_SHIFT) - 1) #define SWAP_RA_HITS_MAX SWAP_RA_HITS_MASK #define SWAP_RA_WIN_MASK (~PAGE_MASK & ~SWAP_RA_HITS_MASK) #define SWAP_RA_HITS(v) ((v) & SWAP_RA_HITS_MASK) #define SWAP_RA_WIN(v) (((v) & SWAP_RA_WIN_MASK) >> SWAP_RA_WIN_SHIFT) #define SWAP_RA_ADDR(v) ((v) & PAGE_MASK) #define SWAP_RA_VAL(addr, win, hits) \ (((addr) & PAGE_MASK) | \ (((win) << SWAP_RA_WIN_SHIFT) & SWAP_RA_WIN_MASK) | \ ((hits) & SWAP_RA_HITS_MASK)) /* Initial readahead hits is 4 to start up with a small window */ #define GET_SWAP_RA_VAL(vma) \ (atomic_long_read(&(vma)->swap_readahead_info) ? : 4) #define INC_CACHE_INFO(x) data_race(swap_cache_info.x++) #define ADD_CACHE_INFO(x, nr) data_race(swap_cache_info.x += (nr)) static struct { unsigned long add_total; unsigned long del_total; unsigned long find_success; unsigned long find_total; } swap_cache_info; static atomic_t swapin_readahead_hits = ATOMIC_INIT(4); void show_swap_cache_info(void) { printk("%lu pages in swap cache\n", total_swapcache_pages()); printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n", swap_cache_info.add_total, swap_cache_info.del_total, swap_cache_info.find_success, swap_cache_info.find_total); printk("Free swap = %ldkB\n", get_nr_swap_pages() << (PAGE_SHIFT - 10)); printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10)); } void *get_shadow_from_swap_cache(swp_entry_t entry) { struct address_space *address_space = swap_address_space(entry); pgoff_t idx = swp_offset(entry); struct page *page; page = xa_load(&address_space->i_pages, idx); if (xa_is_value(page)) return page; return NULL; } /* * add_to_swap_cache resembles add_to_page_cache_locked on swapper_space, * but sets SwapCache flag and private instead of mapping and index. 
*/ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp, void **shadowp) { struct address_space *address_space = swap_address_space(entry); pgoff_t idx = swp_offset(entry); XA_STATE_ORDER(xas, &address_space->i_pages, idx, compound_order(page)); unsigned long i, nr = thp_nr_pages(page); void *old; VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(PageSwapCache(page), page); VM_BUG_ON_PAGE(!PageSwapBacked(page), page); page_ref_add(page, nr); SetPageSwapCache(page); do { xas_lock_irq(&xas); xas_create_range(&xas); if (xas_error(&xas)) goto unlock; for (i = 0; i < nr; i++) { VM_BUG_ON_PAGE(xas.xa_index != idx + i, page); old = xas_load(&xas); if (xa_is_value(old)) { if (shadowp) *shadowp = old; } set_page_private(page + i, entry.val + i); xas_store(&xas, page); xas_next(&xas); } address_space->nrpages += nr; __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr); __mod_lruvec_page_state(page, NR_SWAPCACHE, nr); ADD_CACHE_INFO(add_total, nr); unlock: xas_unlock_irq(&xas); } while (xas_nomem(&xas, gfp)); if (!xas_error(&xas)) return 0; ClearPageSwapCache(page); page_ref_sub(page, nr); return xas_error(&xas); } /* * This must be called only on pages that have * been verified to be in the swap cache. */ void __delete_from_swap_cache(struct page *page, swp_entry_t entry, void *shadow) { struct address_space *address_space = swap_address_space(entry); int i, nr = thp_nr_pages(page); pgoff_t idx = swp_offset(entry); XA_STATE(xas, &address_space->i_pages, idx); VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(!PageSwapCache(page), page); VM_BUG_ON_PAGE(PageWriteback(page), page); for (i = 0; i < nr; i++) { void *entry = xas_store(&xas, shadow); VM_BUG_ON_PAGE(entry != page, entry); set_page_private(page + i, 0); xas_next(&xas); } ClearPageSwapCache(page); address_space->nrpages -= nr; __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr); __mod_lruvec_page_state(page, NR_SWAPCACHE, -nr); ADD_CACHE_INFO(del_total, nr); } /** * add_to_swap - allocate swap space for a page * @page: page we want to move to swap * * Allocate swap space for the page and add the page to the * swap cache. Caller needs to hold the page lock. */ int add_to_swap(struct page *page) { swp_entry_t entry; int err; VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(!PageUptodate(page), page); entry = get_swap_page(page); if (!entry.val) return 0; /* * XArray node allocations from PF_MEMALLOC contexts could * completely exhaust the page allocator. __GFP_NOMEMALLOC * stops emergency reserves from being allocated. * * TODO: this could cause a theoretical memory reclaim * deadlock in the swap out path. */ /* * Add it to the swap cache. */ err = add_to_swap_cache(page, entry, __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN, NULL); if (err) /* * add_to_swap_cache() doesn't return -EEXIST, so we can safely * clear SWAP_HAS_CACHE flag. */ goto fail; /* * Normally the page will be dirtied in unmap because its pte should be * dirty. A special case is MADV_FREE page. The page's pte could have * dirty bit cleared but the page's SwapBacked bit is still set because * clearing the dirty bit and SwapBacked bit has no lock protected. For * such page, unmap will not set dirty bit for it, so page reclaim will * not write the page out. This can cause data corruption when the page * is swap in later. Always setting the dirty bit for the page solves * the problem. 
*/ set_page_dirty(page); return 1; fail: put_swap_page(page, entry); return 0; } /* * This must be called only on pages that have * been verified to be in the swap cache and locked. * It will never put the page into the free list, * the caller has a reference on the page. */ void delete_from_swap_cache(struct page *page) { swp_entry_t entry = { .val = page_private(page) }; struct address_space *address_space = swap_address_space(entry); xa_lock_irq(&address_space->i_pages); __delete_from_swap_cache(page, entry, NULL); xa_unlock_irq(&address_space->i_pages); put_swap_page(page, entry); page_ref_sub(page, thp_nr_pages(page)); } void clear_shadow_from_swap_cache(int type, unsigned long begin, unsigned long end) { unsigned long curr = begin; void *old; for (;;) { swp_entry_t entry = swp_entry(type, curr); struct address_space *address_space = swap_address_space(entry); XA_STATE(xas, &address_space->i_pages, curr); xa_lock_irq(&address_space->i_pages); xas_for_each(&xas, old, end) { if (!xa_is_value(old)) continue; xas_store(&xas, NULL); } xa_unlock_irq(&address_space->i_pages); /* search the next swapcache until we meet end */ curr >>= SWAP_ADDRESS_SPACE_SHIFT; curr++; curr <<= SWAP_ADDRESS_SPACE_SHIFT; if (curr > end) break; } } /* * If we are the only user, then try to free up the swap cache. * * Its ok to check for PageSwapCache without the page lock * here because we are going to recheck again inside * try_to_free_swap() _with_ the lock. * - Marcelo */ void free_swap_cache(struct page *page) { if (PageSwapCache(page) && !page_mapped(page) && trylock_page(page)) { try_to_free_swap(page); unlock_page(page); } } /* * Perform a free_page(), also freeing any swap cache associated with * this page if it is the last user of the page. */ void free_page_and_swap_cache(struct page *page) { free_swap_cache(page); if (!is_huge_zero_page(page)) put_page(page); } /* * Passed an array of pages, drop them all from swapcache and then release * them. They are removed from the LRU and freed if this is their last use. */ void free_pages_and_swap_cache(struct page **pages, int nr) { struct page **pagep = pages; int i; lru_add_drain(); for (i = 0; i < nr; i++) free_swap_cache(pagep[i]); release_pages(pagep, nr); } static inline bool swap_use_vma_readahead(void) { return READ_ONCE(enable_vma_readahead) && !atomic_read(&nr_rotate_swap); } /* * Lookup a swap entry in the swap cache. A found page will be returned * unlocked and with its refcount incremented - we rely on the kernel * lock getting page table operations atomic even if we drop the page * lock before returning. */ struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma, unsigned long addr) { struct page *page; struct swap_info_struct *si; si = get_swap_device(entry); if (!si) return NULL; page = find_get_page(swap_address_space(entry), swp_offset(entry)); put_swap_device(si); INC_CACHE_INFO(find_total); if (page) { bool vma_ra = swap_use_vma_readahead(); bool readahead; INC_CACHE_INFO(find_success); /* * At the moment, we don't support PG_readahead for anon THP * so let's bail out rather than confusing the readahead stat. 
*/ if (unlikely(PageTransCompound(page))) return page; readahead = TestClearPageReadahead(page); if (vma && vma_ra) { unsigned long ra_val; int win, hits; ra_val = GET_SWAP_RA_VAL(vma); win = SWAP_RA_WIN(ra_val); hits = SWAP_RA_HITS(ra_val); if (readahead) hits = min_t(int, hits + 1, SWAP_RA_HITS_MAX); atomic_long_set(&vma->swap_readahead_info, SWAP_RA_VAL(addr, win, hits)); } if (readahead) { count_vm_event(SWAP_RA_HIT); if (!vma || !vma_ra) atomic_inc(&swapin_readahead_hits); } } return page; } /** * find_get_incore_page - Find and get a page from the page or swap caches. * @mapping: The address_space to search. * @index: The page cache index. * * This differs from find_get_page() in that it will also look for the * page in the swap cache. * * Return: The found page or %NULL. */ struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index) { swp_entry_t swp; struct swap_info_struct *si; struct page *page = pagecache_get_page(mapping, index, FGP_ENTRY | FGP_HEAD, 0); if (!page) return page; if (!xa_is_value(page)) return find_subpage(page, index); if (!shmem_mapping(mapping)) return NULL; swp = radix_to_swp_entry(page); /* Prevent swapoff from happening to us */ si = get_swap_device(swp); if (!si) return NULL; page = find_get_page(swap_address_space(swp), swp_offset(swp)); put_swap_device(si); return page; } struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct vm_area_struct *vma, unsigned long addr, bool *new_page_allocated) { struct swap_info_struct *si; struct page *page; void *shadow = NULL; *new_page_allocated = false; for (;;) { int err; /* * First check the swap cache. Since this is normally * called after lookup_swap_cache() failed, re-calling * that would confuse statistics. */ si = get_swap_device(entry); if (!si) return NULL; page = find_get_page(swap_address_space(entry), swp_offset(entry)); put_swap_device(si); if (page) return page; /* * Just skip read ahead for unused swap slot. * During swap_off when swap_slot_cache is disabled, * we have to handle the race between putting * swap entry in swap cache and marking swap slot * as SWAP_HAS_CACHE. That's done in later part of code or * else swap_off will be aborted if we return NULL. */ if (!__swp_swapcount(entry) && swap_slot_cache_enabled) return NULL; /* * Get a new page to read into from swap. Allocate it now, * before marking swap_map SWAP_HAS_CACHE, when -EEXIST will * cause any racers to loop around until we add it to cache. */ page = alloc_page_vma(gfp_mask, vma, addr); if (!page) return NULL; /* * Swap entry may have been freed since our caller observed it. */ err = swapcache_prepare(entry); if (!err) break; put_page(page); if (err != -EEXIST) return NULL; /* * We might race against __delete_from_swap_cache(), and * stumble across a swap_map entry whose SWAP_HAS_CACHE * has not yet been cleared. Or race against another * __read_swap_cache_async(), which has set SWAP_HAS_CACHE * in swap_map, but not yet added its page to swap cache. */ schedule_timeout_uninterruptible(1); } /* * The swap entry is ours to swap in. Prepare the new page. */ __SetPageLocked(page); __SetPageSwapBacked(page); if (mem_cgroup_swapin_charge_page(page, NULL, gfp_mask, entry)) goto fail_unlock; /* May fail (-ENOMEM) if XArray node allocation failed. 
static unsigned long swapin_nr_pages(unsigned long offset)
{
	static unsigned long prev_offset;
	unsigned int hits, pages, max_pages;
	static atomic_t last_readahead_pages;

	max_pages = 1 << READ_ONCE(page_cluster);
	if (max_pages <= 1)
		return 1;

	hits = atomic_xchg(&swapin_readahead_hits, 0);
	pages = __swapin_nr_pages(READ_ONCE(prev_offset), offset, hits,
				  max_pages,
				  atomic_read(&last_readahead_pages));
	if (!hits)
		WRITE_ONCE(prev_offset, offset);
	atomic_set(&last_readahead_pages, pages);

	return pages;
}

/**
 * swap_cluster_readahead - swap in pages in hope we need them soon
 * @entry: swap entry of this memory
 * @gfp_mask: memory allocation flags
 * @vmf: fault information
 *
 * Returns the struct page for entry and addr, after queueing swapin.
 *
 * Primitive swap readahead code. We simply read an aligned block of
 * (1 << page_cluster) entries in the swap area. This method is chosen
 * because it doesn't cost us any seek time.  We also make sure to queue
 * the 'original' request together with the readahead ones...
 *
 * This has been extended to use the NUMA policies from the mm triggering
 * the readahead.
 *
 * Caller must hold read mmap_lock if vmf->vma is not NULL.
 */
struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
				struct vm_fault *vmf)
{
	struct page *page;
	unsigned long entry_offset = swp_offset(entry);
	unsigned long offset = entry_offset;
	unsigned long start_offset, end_offset;
	unsigned long mask;
	struct swap_info_struct *si = swp_swap_info(entry);
	struct blk_plug plug;
	bool do_poll = true, page_allocated;
	struct vm_area_struct *vma = vmf->vma;
	unsigned long addr = vmf->address;

	mask = swapin_nr_pages(offset) - 1;
	if (!mask)
		goto skip;

	do_poll = false;
	/* Read a page_cluster sized and aligned cluster around offset. */
	start_offset = offset & ~mask;
	end_offset = offset | mask;
	if (!start_offset)	/* First page is swap header. */
		start_offset++;
	if (end_offset >= si->max)
		end_offset = si->max - 1;

	blk_start_plug(&plug);
	for (offset = start_offset; offset <= end_offset ; offset++) {
		/* Ok, do the async read-ahead now */
		page = __read_swap_cache_async(
			swp_entry(swp_type(entry), offset),
			gfp_mask, vma, addr, &page_allocated);
		if (!page)
			continue;
		if (page_allocated) {
			swap_readpage(page, false);
			if (offset != entry_offset) {
				SetPageReadahead(page);
				count_vm_event(SWAP_RA);
			}
		}
		put_page(page);
	}
	blk_finish_plug(&plug);

	lru_add_drain();	/* Push any new pages onto the LRU now */
skip:
	return read_swap_cache_async(entry, gfp_mask, vma, addr, do_poll);
}
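/*
 * Worked example (illustrative sketch, not part of the original file): with
 * a window of 8 pages from swapin_nr_pages(), mask == 7, so a fault on swap
 * offset 1234 reads the aligned block
 *
 *	start_offset = 1234 & ~7 == 1232
 *	end_offset   = 1234 |  7 == 1239
 *
 * i.e. offsets 1232..1239 are queued, the faulting entry included.  The two
 * fixups only matter at the edges of the device: offset 0 is skipped because
 * it holds the swap header, and end_offset is clamped to si->max - 1.
 */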
int init_swap_address_space(unsigned int type, unsigned long nr_pages)
{
	struct address_space *spaces, *space;
	unsigned int i, nr;

	nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES);
	spaces = kvcalloc(nr, sizeof(struct address_space), GFP_KERNEL);
	if (!spaces)
		return -ENOMEM;
	for (i = 0; i < nr; i++) {
		space = spaces + i;
		xa_init_flags(&space->i_pages, XA_FLAGS_LOCK_IRQ);
		atomic_set(&space->i_mmap_writable, 0);
		space->a_ops = &swap_aops;
		/* swap cache doesn't use writeback related tags */
		mapping_set_no_writeback_tags(space);
	}
	nr_swapper_spaces[type] = nr;
	swapper_spaces[type] = spaces;

	return 0;
}

void exit_swap_address_space(unsigned int type)
{
	int i;
	struct address_space *spaces = swapper_spaces[type];

	for (i = 0; i < nr_swapper_spaces[type]; i++)
		VM_WARN_ON_ONCE(!mapping_empty(&spaces[i]));
	kvfree(spaces);
	nr_swapper_spaces[type] = 0;
	swapper_spaces[type] = NULL;
}

static inline void swap_ra_clamp_pfn(struct vm_area_struct *vma,
				     unsigned long faddr,
				     unsigned long lpfn,
				     unsigned long rpfn,
				     unsigned long *start,
				     unsigned long *end)
{
	*start = max3(lpfn, PFN_DOWN(vma->vm_start),
		      PFN_DOWN(faddr & PMD_MASK));
	*end = min3(rpfn, PFN_DOWN(vma->vm_end),
		    PFN_DOWN((faddr & PMD_MASK) + PMD_SIZE));
}

static void swap_ra_info(struct vm_fault *vmf,
			struct vma_swap_readahead *ra_info)
{
	struct vm_area_struct *vma = vmf->vma;
	unsigned long ra_val;
	unsigned long faddr, pfn, fpfn;
	unsigned long start, end;
	pte_t *pte, *orig_pte;
	unsigned int max_win, hits, prev_win, win, left;
#ifndef CONFIG_64BIT
	pte_t *tpte;
#endif

	max_win = 1 << min_t(unsigned int, READ_ONCE(page_cluster),
			     SWAP_RA_ORDER_CEILING);
	if (max_win == 1) {
		ra_info->win = 1;
		return;
	}

	faddr = vmf->address;
	orig_pte = pte = pte_offset_map(vmf->pmd, faddr);

	fpfn = PFN_DOWN(faddr);
	ra_val = GET_SWAP_RA_VAL(vma);
	pfn = PFN_DOWN(SWAP_RA_ADDR(ra_val));
	prev_win = SWAP_RA_WIN(ra_val);
	hits = SWAP_RA_HITS(ra_val);
	ra_info->win = win = __swapin_nr_pages(pfn, fpfn, hits,
					       max_win, prev_win);
	atomic_long_set(&vma->swap_readahead_info,
			SWAP_RA_VAL(faddr, win, 0));

	if (win == 1) {
		pte_unmap(orig_pte);
		return;
	}

	/* Copy the PTEs because the page table may be unmapped */
	if (fpfn == pfn + 1)
		swap_ra_clamp_pfn(vma, faddr, fpfn, fpfn + win,
				  &start, &end);
	else if (pfn == fpfn + 1)
		swap_ra_clamp_pfn(vma, faddr, fpfn - win + 1, fpfn + 1,
				  &start, &end);
	else {
		left = (win - 1) / 2;
		swap_ra_clamp_pfn(vma, faddr, fpfn - left, fpfn + win - left,
				  &start, &end);
	}
	ra_info->nr_pte = end - start;
	ra_info->offset = fpfn - start;
	pte -= ra_info->offset;
#ifdef CONFIG_64BIT
	ra_info->ptes = pte;
#else
	tpte = ra_info->ptes;
	for (pfn = start; pfn != end; pfn++)
		*tpte++ = *pte++;
#endif
	pte_unmap(orig_pte);
}
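/*
 * Worked example (illustrative sketch, not part of the original file), with
 * win == 4 and a fault at fpfn == 100, where pfn is the PFN of the previous
 * fault recorded in swap_readahead_info:
 *
 *	forward scan  (pfn ==  99): window is PFNs 100..103, ahead of us;
 *	backward scan (pfn == 101): window is PFNs  97..100, behind us;
 *	anything else:              left == 1, window is PFNs 99..102,
 *				    roughly centred on the fault.
 *
 * swap_ra_clamp_pfn() then trims the window to the VMA and to the PMD that
 * contains the fault address, since only that one page table was mapped by
 * pte_offset_map() above.
 */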
/**
 * swap_vma_readahead - swap in pages in hope we need them soon
 * @fentry: swap entry of this memory
 * @gfp_mask: memory allocation flags
 * @vmf: fault information
 *
 * Returns the struct page for entry and addr, after queueing swapin.
 *
 * Primitive swap readahead code. We simply read in a few pages whose
 * virtual addresses are around the fault address in the same vma.
 *
 * Caller must hold read mmap_lock if vmf->vma is not NULL.
 *
 */
s