Total coverage: 174966 of 1223181 (15%)
// SPDX-License-Identifier: GPL-2.0
/* Copyright (C) B.A.T.M.A.N. contributors:
 *
 * Simon Wunderlich
 */

#include "bridge_loop_avoidance.h"
#include "main.h"

#include <linux/atomic.h>
#include <linux/byteorder/generic.h>
#include <linux/compiler.h>
#include <linux/container_of.h>
#include <linux/crc16.h>
#include <linux/crc32.h>
#include <linux/err.h>
#include <linux/errno.h>
#include <linux/etherdevice.h>
#include <linux/gfp.h>
#include <linux/if_arp.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/jhash.h>
#include <linux/jiffies.h>
#include <linux/kref.h>
#include <linux/list.h>
#include <linux/lockdep.h>
#include <linux/netdevice.h>
#include <linux/netlink.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/skbuff.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/sprintf.h>
#include <linux/stddef.h>
#include <linux/string.h>
#include <linux/string_choices.h>
#include <linux/workqueue.h>
#include <net/arp.h>
#include <net/genetlink.h>
#include <net/netlink.h>
#include <uapi/linux/batadv_packet.h>
#include <uapi/linux/batman_adv.h>

#include "hard-interface.h"
#include "hash.h"
#include "log.h"
#include "netlink.h"
#include "originator.h"
#include "translation-table.h"

static const u8 batadv_announce_mac[4] = {0x43, 0x05, 0x43, 0x05};

static void batadv_bla_periodic_work(struct work_struct *work);
static void
batadv_bla_send_announce(struct batadv_priv *bat_priv,
			 struct batadv_bla_backbone_gw *backbone_gw);

/**
 * batadv_choose_claim() - choose the right bucket for a claim.
 * @data: data to hash
 * @size: size of the hash table
 *
 * Return: the hash index of the claim
 */
static inline u32 batadv_choose_claim(const void *data, u32 size)
{
	const struct batadv_bla_claim *claim = data;
	u32 hash = 0;

	hash = jhash(&claim->addr, sizeof(claim->addr), hash);
	hash = jhash(&claim->vid, sizeof(claim->vid), hash);

	return hash % size;
}

/**
 * batadv_choose_backbone_gw() - choose the right bucket for a backbone gateway.
 * @data: data to hash
 * @size: size of the hash table
 *
 * Return: the hash index of the backbone gateway
 */
static inline u32 batadv_choose_backbone_gw(const void *data, u32 size)
{
	const struct batadv_bla_backbone_gw *gw;
	u32 hash = 0;

	gw = data;
	hash = jhash(&gw->orig, sizeof(gw->orig), hash);
	hash = jhash(&gw->vid, sizeof(gw->vid), hash);

	return hash % size;
}

/**
 * batadv_compare_backbone_gw() - compare address and vid of two backbone gws
 * @node: list node of the first entry to compare
 * @data2: pointer to the second backbone gateway
 *
 * Return: true if the backbones have the same data, false otherwise
 */
static bool batadv_compare_backbone_gw(const struct hlist_node *node,
				       const void *data2)
{
	const void *data1 = container_of(node, struct batadv_bla_backbone_gw,
					 hash_entry);
	const struct batadv_bla_backbone_gw *gw1 = data1;
	const struct batadv_bla_backbone_gw *gw2 = data2;

	if (!batadv_compare_eth(gw1->orig, gw2->orig))
		return false;

	if (gw1->vid != gw2->vid)
		return false;

	return true;
}

/**
 * batadv_compare_claim() - compare address and vid of two claims
 * @node: list node of the first entry to compare
 * @data2: pointer to the second claim
 *
 * Return: true if the claims have the same data, false otherwise
 */
static bool batadv_compare_claim(const struct hlist_node *node,
				 const void *data2)
{
	const void *data1 = container_of(node, struct batadv_bla_claim,
					 hash_entry);
	const struct batadv_bla_claim *cl1 = data1;
	const struct batadv_bla_claim *cl2 = data2;

	if (!batadv_compare_eth(cl1->addr, cl2->addr))
		return false;

	if (cl1->vid != cl2->vid)
		return false;

	return true;
}

/**
 * batadv_backbone_gw_release() - release backbone gw from lists and queue for
 *  free after rcu grace period
 * @ref: kref pointer of the backbone gw
 */
static void batadv_backbone_gw_release(struct kref *ref)
{
	struct batadv_bla_backbone_gw *backbone_gw;

	backbone_gw = container_of(ref, struct batadv_bla_backbone_gw,
				   refcount);

	kfree_rcu(backbone_gw, rcu);
}

/**
 * batadv_backbone_gw_put() - decrement the backbone gw refcounter and possibly
 *  release it
 * @backbone_gw: backbone gateway to be free'd
 */
static void batadv_backbone_gw_put(struct batadv_bla_backbone_gw *backbone_gw)
{
	if (!backbone_gw)
		return;

	kref_put(&backbone_gw->refcount, batadv_backbone_gw_release);
}

/**
 * batadv_claim_release() - release claim from lists and queue for free after
 *  rcu grace period
 * @ref: kref pointer of the claim
 */
static void batadv_claim_release(struct kref *ref)
{
	struct batadv_bla_claim *claim;
	struct batadv_bla_backbone_gw *old_backbone_gw;

	claim = container_of(ref, struct batadv_bla_claim, refcount);

	spin_lock_bh(&claim->backbone_lock);
	old_backbone_gw = claim->backbone_gw;
	claim->backbone_gw = NULL;
	spin_unlock_bh(&claim->backbone_lock);

	spin_lock_bh(&old_backbone_gw->crc_lock);
	old_backbone_gw->crc ^= crc16(0, claim->addr, ETH_ALEN);
	spin_unlock_bh(&old_backbone_gw->crc_lock);

	batadv_backbone_gw_put(old_backbone_gw);

	kfree_rcu(claim, rcu);
}

/**
 * batadv_claim_put() - decrement the claim refcounter and possibly release it
 * @claim: claim to be free'd
 */
static void batadv_claim_put(struct batadv_bla_claim *claim)
{
	if (!claim)
		return;

	kref_put(&claim->refcount, batadv_claim_release);
}

/**
 * batadv_claim_hash_find() - looks for a claim in the claim hash
 * @bat_priv: the bat priv with all the mesh interface information
 * @data: search data (may be local/static data)
 *
 * Return: claim if found or NULL otherwise.
 */
static struct batadv_bla_claim *
batadv_claim_hash_find(struct batadv_priv *bat_priv,
		       struct batadv_bla_claim *data)
{
	struct batadv_hashtable *hash = bat_priv->bla.claim_hash;
	struct hlist_head *head;
	struct batadv_bla_claim *claim;
	struct batadv_bla_claim *claim_tmp = NULL;
	int index;

	if (!hash)
		return NULL;

	index = batadv_choose_claim(data, hash->size);
	head = &hash->table[index];

	rcu_read_lock();
	hlist_for_each_entry_rcu(claim, head, hash_entry) {
		if (!batadv_compare_claim(&claim->hash_entry, data))
			continue;

		if (!kref_get_unless_zero(&claim->refcount))
			continue;

		claim_tmp = claim;
		break;
	}
	rcu_read_unlock();

	return claim_tmp;
}

/**
 * batadv_backbone_hash_find() - looks for a backbone gateway in the hash
 * @bat_priv: the bat priv with all the mesh interface information
 * @addr: the address of the originator
 * @vid: the VLAN ID
 *
 * Return: backbone gateway if found or NULL otherwise
 */
static struct batadv_bla_backbone_gw *
batadv_backbone_hash_find(struct batadv_priv *bat_priv, const u8 *addr,
			  unsigned short vid)
{
	struct batadv_hashtable *hash = bat_priv->bla.backbone_hash;
	struct hlist_head *head;
	struct batadv_bla_backbone_gw search_entry, *backbone_gw;
	struct batadv_bla_backbone_gw *backbone_gw_tmp = NULL;
	int index;

	if (!hash)
		return NULL;

	ether_addr_copy(search_entry.orig, addr);
	search_entry.vid = vid;

	index = batadv_choose_backbone_gw(&search_entry, hash->size);
	head = &hash->table[index];

	rcu_read_lock();
	hlist_for_each_entry_rcu(backbone_gw, head, hash_entry) {
		if (!batadv_compare_backbone_gw(&backbone_gw->hash_entry,
						&search_entry))
			continue;

		if (!kref_get_unless_zero(&backbone_gw->refcount))
			continue;

		backbone_gw_tmp = backbone_gw;
		break;
	}
	rcu_read_unlock();

	return backbone_gw_tmp;
}

/**
 * batadv_bla_del_backbone_claims() - delete all claims for a backbone
 * @backbone_gw: backbone gateway where the claims should be removed
 */
static void
batadv_bla_del_backbone_claims(struct batadv_bla_backbone_gw *backbone_gw)
{
	struct batadv_hashtable *hash;
	struct hlist_node *node_tmp;
	struct hlist_head *head;
	struct batadv_bla_claim *claim;
	int i;
	spinlock_t *list_lock;	/* protects write access to the hash lists */

	hash = backbone_gw->bat_priv->bla.claim_hash;
	if (!hash)
		return;

	for (i = 0; i < hash->size; i++) {
		head = &hash->table[i];
		list_lock = &hash->list_locks[i];

		spin_lock_bh(list_lock);
		hlist_for_each_entry_safe(claim, node_tmp,
					  head, hash_entry) {
			if (claim->backbone_gw != backbone_gw)
				continue;

			batadv_claim_put(claim);
			hlist_del_rcu(&claim->hash_entry);
		}
		spin_unlock_bh(list_lock);
	}

	/* all claims gone, initialize CRC */
	spin_lock_bh(&backbone_gw->crc_lock);
	backbone_gw->crc = BATADV_BLA_CRC_INIT;
	spin_unlock_bh(&backbone_gw->crc_lock);
}

/**
 * batadv_bla_send_claim() - sends a claim frame according to the provided info
 * @bat_priv: the bat priv with all the mesh interface information
 * @mac: the mac address to be announced within the claim
 * @vid: the VLAN ID
 * @claimtype: the type of the claim (CLAIM, UNCLAIM, ANNOUNCE, ...)
*/ static void batadv_bla_send_claim(struct batadv_priv *bat_priv, const u8 *mac, unsigned short vid, int claimtype) { struct sk_buff *skb; struct ethhdr *ethhdr; struct batadv_hard_iface *primary_if; struct net_device *mesh_iface; u8 *hw_src; struct batadv_bla_claim_dst local_claim_dest; __be32 zeroip = 0; primary_if = batadv_primary_if_get_selected(bat_priv); if (!primary_if) return; memcpy(&local_claim_dest, &bat_priv->bla.claim_dest, sizeof(local_claim_dest)); local_claim_dest.type = claimtype; mesh_iface = primary_if->mesh_iface; skb = arp_create(ARPOP_REPLY, ETH_P_ARP, /* IP DST: 0.0.0.0 */ zeroip, primary_if->mesh_iface, /* IP SRC: 0.0.0.0 */ zeroip, /* Ethernet DST: Broadcast */ NULL, /* Ethernet SRC/HW SRC: originator mac */ primary_if->net_dev->dev_addr, /* HW DST: FF:43:05:XX:YY:YY * with XX = claim type * and YY:YY = group id */ (u8 *)&local_claim_dest); if (!skb) goto out; ethhdr = (struct ethhdr *)skb->data; hw_src = (u8 *)ethhdr + ETH_HLEN + sizeof(struct arphdr); /* now we pretend that the client would have sent this ... */ switch (claimtype) { case BATADV_CLAIM_TYPE_CLAIM: /* normal claim frame * set Ethernet SRC to the clients mac */ ether_addr_copy(ethhdr->h_source, mac); batadv_dbg(BATADV_DBG_BLA, bat_priv, "%s(): CLAIM %pM on vid %d\n", __func__, mac, batadv_print_vid(vid)); break; case BATADV_CLAIM_TYPE_UNCLAIM: /* unclaim frame * set HW SRC to the clients mac */ ether_addr_copy(hw_src, mac); batadv_dbg(BATADV_DBG_BLA, bat_priv, "%s(): UNCLAIM %pM on vid %d\n", __func__, mac, batadv_print_vid(vid)); break; case BATADV_CLAIM_TYPE_ANNOUNCE: /* announcement frame * set HW SRC to the special mac containing the crc */ ether_addr_copy(hw_src, mac); batadv_dbg(BATADV_DBG_BLA, bat_priv, "%s(): ANNOUNCE of %pM on vid %d\n", __func__, ethhdr->h_source, batadv_print_vid(vid)); break; case BATADV_CLAIM_TYPE_REQUEST: /* request frame * set HW SRC and header destination to the receiving backbone * gws mac */ ether_addr_copy(hw_src, mac); ether_addr_copy(ethhdr->h_dest, mac); batadv_dbg(BATADV_DBG_BLA, bat_priv, "%s(): REQUEST of %pM to %pM on vid %d\n", __func__, ethhdr->h_source, ethhdr->h_dest, batadv_print_vid(vid)); break; case BATADV_CLAIM_TYPE_LOOPDETECT: ether_addr_copy(ethhdr->h_source, mac); batadv_dbg(BATADV_DBG_BLA, bat_priv, "%s(): LOOPDETECT of %pM to %pM on vid %d\n", __func__, ethhdr->h_source, ethhdr->h_dest, batadv_print_vid(vid)); break; } if (vid & BATADV_VLAN_HAS_TAG) { skb = vlan_insert_tag(skb, htons(ETH_P_8021Q), vid & VLAN_VID_MASK); if (!skb) goto out; } skb_reset_mac_header(skb); skb->protocol = eth_type_trans(skb, mesh_iface); batadv_inc_counter(bat_priv, BATADV_CNT_RX); batadv_add_counter(bat_priv, BATADV_CNT_RX_BYTES, skb->len + ETH_HLEN); netif_rx(skb); out: batadv_hardif_put(primary_if); } /** * batadv_bla_loopdetect_report() - worker for reporting the loop * @work: work queue item * * Throws an uevent, as the loopdetect check function can't do that itself * since the kernel may sleep while throwing uevents. 
*/ static void batadv_bla_loopdetect_report(struct work_struct *work) { struct batadv_bla_backbone_gw *backbone_gw; struct batadv_priv *bat_priv; char vid_str[6] = { '\0' }; backbone_gw = container_of(work, struct batadv_bla_backbone_gw, report_work); bat_priv = backbone_gw->bat_priv; batadv_info(bat_priv->mesh_iface, "Possible loop on VLAN %d detected which can't be handled by BLA - please check your network setup!\n", batadv_print_vid(backbone_gw->vid)); snprintf(vid_str, sizeof(vid_str), "%d", batadv_print_vid(backbone_gw->vid)); vid_str[sizeof(vid_str) - 1] = 0; batadv_throw_uevent(bat_priv, BATADV_UEV_BLA, BATADV_UEV_LOOPDETECT, vid_str); batadv_backbone_gw_put(backbone_gw); } /** * batadv_bla_get_backbone_gw() - finds or creates a backbone gateway * @bat_priv: the bat priv with all the mesh interface information * @orig: the mac address of the originator * @vid: the VLAN ID * @own_backbone: set if the requested backbone is local * * Return: the (possibly created) backbone gateway or NULL on error */ static struct batadv_bla_backbone_gw * batadv_bla_get_backbone_gw(struct batadv_priv *bat_priv, const u8 *orig, unsigned short vid, bool own_backbone) { struct batadv_bla_backbone_gw *entry; struct batadv_orig_node *orig_node; int hash_added; entry = batadv_backbone_hash_find(bat_priv, orig, vid); if (entry) return entry; batadv_dbg(BATADV_DBG_BLA, bat_priv, "%s(): not found (%pM, %d), creating new entry\n", __func__, orig, batadv_print_vid(vid)); entry = kzalloc(sizeof(*entry), GFP_ATOMIC); if (!entry) return NULL; entry->vid = vid; entry->lasttime = jiffies; entry->crc = BATADV_BLA_CRC_INIT; entry->bat_priv = bat_priv; spin_lock_init(&entry->crc_lock); atomic_set(&entry->request_sent, 0); atomic_set(&entry->wait_periods, 0); ether_addr_copy(entry->orig, orig); INIT_WORK(&entry->report_work, batadv_bla_loopdetect_report); kref_init(&entry->refcount); kref_get(&entry->refcount); hash_added = batadv_hash_add(bat_priv->bla.backbone_hash, batadv_compare_backbone_gw, batadv_choose_backbone_gw, entry, &entry->hash_entry); if (unlikely(hash_added != 0)) { /* hash failed, free the structure */ kfree(entry); return NULL; } /* this is a gateway now, remove any TT entry on this VLAN */ orig_node = batadv_orig_hash_find(bat_priv, orig); if (orig_node) { batadv_tt_global_del_orig(bat_priv, orig_node, vid, "became a backbone gateway"); batadv_orig_node_put(orig_node); } if (own_backbone) { batadv_bla_send_announce(bat_priv, entry); /* this will be decreased in the worker thread */ atomic_inc(&entry->request_sent); atomic_set(&entry->wait_periods, BATADV_BLA_WAIT_PERIODS); atomic_inc(&bat_priv->bla.num_requests); } return entry; } /** * batadv_bla_update_own_backbone_gw() - updates the own backbone gw for a VLAN * @bat_priv: the bat priv with all the mesh interface information * @primary_if: the selected primary interface * @vid: VLAN identifier * * update or add the own backbone gw to make sure we announce * where we receive other backbone gws */ static void batadv_bla_update_own_backbone_gw(struct batadv_priv *bat_priv, struct batadv_hard_iface *primary_if, unsigned short vid) { struct batadv_bla_backbone_gw *backbone_gw; backbone_gw = batadv_bla_get_backbone_gw(bat_priv, primary_if->net_dev->dev_addr, vid, true); if (unlikely(!backbone_gw)) return; backbone_gw->lasttime = jiffies; batadv_backbone_gw_put(backbone_gw); } /** * batadv_bla_answer_request() - answer a bla request by sending own claims * @bat_priv: the bat priv with all the mesh interface information * @primary_if: interface where the 
request came on * @vid: the vid where the request came on * * Repeat all of our own claims, and finally send an ANNOUNCE frame * to allow the requester another check if the CRC is correct now. */ static void batadv_bla_answer_request(struct batadv_priv *bat_priv, struct batadv_hard_iface *primary_if, unsigned short vid) { struct hlist_head *head; struct batadv_hashtable *hash; struct batadv_bla_claim *claim; struct batadv_bla_backbone_gw *backbone_gw; int i; batadv_dbg(BATADV_DBG_BLA, bat_priv, "%s(): received a claim request, send all of our own claims again\n", __func__); backbone_gw = batadv_backbone_hash_find(bat_priv, primary_if->net_dev->dev_addr, vid); if (!backbone_gw) return; hash = bat_priv->bla.claim_hash; for (i = 0; i < hash->size; i++) { head = &hash->table[i]; rcu_read_lock(); hlist_for_each_entry_rcu(claim, head, hash_entry) { /* only own claims are interesting */ if (claim->backbone_gw != backbone_gw) continue; batadv_bla_send_claim(bat_priv, claim->addr, claim->vid, BATADV_CLAIM_TYPE_CLAIM); } rcu_read_unlock(); } /* finally, send an announcement frame */ batadv_bla_send_announce(bat_priv, backbone_gw); batadv_backbone_gw_put(backbone_gw); } /** * batadv_bla_send_request() - send a request to repeat claims * @backbone_gw: the backbone gateway from whom we are out of sync * * When the crc is wrong, ask the backbone gateway for a full table update. * After the request, it will repeat all of his own claims and finally * send an announcement claim with which we can check again. */ static void batadv_bla_send_request(struct batadv_bla_backbone_gw *backbone_gw) { /* first, remove all old entries */ batadv_bla_del_backbone_claims(backbone_gw); batadv_dbg(BATADV_DBG_BLA, backbone_gw->bat_priv, "Sending REQUEST to %pM\n", backbone_gw->orig); /* send request */ batadv_bla_send_claim(backbone_gw->bat_priv, backbone_gw->orig, backbone_gw->vid, BATADV_CLAIM_TYPE_REQUEST); /* no local broadcasts should be sent or received, for now. */ if (!atomic_read(&backbone_gw->request_sent)) { atomic_inc(&backbone_gw->bat_priv->bla.num_requests); atomic_set(&backbone_gw->request_sent, 1); } } /** * batadv_bla_send_announce() - Send an announcement frame * @bat_priv: the bat priv with all the mesh interface information * @backbone_gw: our backbone gateway which should be announced */ static void batadv_bla_send_announce(struct batadv_priv *bat_priv, struct batadv_bla_backbone_gw *backbone_gw) { u8 mac[ETH_ALEN]; __be16 crc; memcpy(mac, batadv_announce_mac, 4); spin_lock_bh(&backbone_gw->crc_lock); crc = htons(backbone_gw->crc); spin_unlock_bh(&backbone_gw->crc_lock); memcpy(&mac[4], &crc, 2); batadv_bla_send_claim(bat_priv, mac, backbone_gw->vid, BATADV_CLAIM_TYPE_ANNOUNCE); } /** * batadv_bla_add_claim() - Adds a claim in the claim hash * @bat_priv: the bat priv with all the mesh interface information * @mac: the mac address of the claim * @vid: the VLAN ID of the frame * @backbone_gw: the backbone gateway which claims it */ static void batadv_bla_add_claim(struct batadv_priv *bat_priv, const u8 *mac, const unsigned short vid, struct batadv_bla_backbone_gw *backbone_gw) { struct batadv_bla_backbone_gw *old_backbone_gw; struct batadv_bla_claim *claim; struct batadv_bla_claim search_claim; bool remove_crc = false; int hash_added; ether_addr_copy(search_claim.addr, mac); search_claim.vid = vid; claim = batadv_claim_hash_find(bat_priv, &search_claim); /* create a new claim entry if it does not exist yet. 
*/ if (!claim) { claim = kzalloc(sizeof(*claim), GFP_ATOMIC); if (!claim) return; ether_addr_copy(claim->addr, mac); spin_lock_init(&claim->backbone_lock); claim->vid = vid; claim->lasttime = jiffies; kref_get(&backbone_gw->refcount); claim->backbone_gw = backbone_gw; kref_init(&claim->refcount); batadv_dbg(BATADV_DBG_BLA, bat_priv, "%s(): adding new entry %pM, vid %d to hash ...\n", __func__, mac, batadv_print_vid(vid)); kref_get(&claim->refcount); hash_added = batadv_hash_add(bat_priv->bla.claim_hash, batadv_compare_claim, batadv_choose_claim, claim, &claim->hash_entry); if (unlikely(hash_added != 0)) { /* only local changes happened. */ kfree(claim); return; } } else { claim->lasttime = jiffies; if (claim->backbone_gw == backbone_gw) /* no need to register a new backbone */ goto claim_free_ref; batadv_dbg(BATADV_DBG_BLA, bat_priv, "%s(): changing ownership for %pM, vid %d to gw %pM\n", __func__, mac, batadv_print_vid(vid), backbone_gw->orig); remove_crc = true; } /* replace backbone_gw atomically and adjust reference counters */ spin_lock_bh(&claim->backbone_lock); old_backbone_gw = claim->backbone_gw; kref_get(&backbone_gw->refcount); claim->backbone_gw = backbone_gw; spin_unlock_bh(&claim->backbone_lock); if (remove_crc) { /* remove claim address from old backbone_gw */ spin_lock_bh(&old_backbone_gw->crc_lock); old_backbone_gw->crc ^= crc16(0, claim->addr, ETH_ALEN); spin_unlock_bh(&old_backbone_gw->crc_lock); } batadv_backbone_gw_put(old_backbone_gw); /* add claim address to new backbone_gw */ spin_lock_bh(&backbone_gw->crc_lock); backbone_gw->crc ^= crc16(0, claim->addr, ETH_ALEN); spin_unlock_bh(&backbone_gw->crc_lock); backbone_gw->lasttime = jiffies; claim_free_ref: batadv_claim_put(claim); } /** * batadv_bla_claim_get_backbone_gw() - Get valid reference for backbone_gw of * claim * @claim: claim whose backbone_gw should be returned * * Return: valid reference to claim::backbone_gw */ static struct batadv_bla_backbone_gw * batadv_bla_claim_get_backbone_gw(struct batadv_bla_claim *claim) { struct batadv_bla_backbone_gw *backbone_gw; spin_lock_bh(&claim->backbone_lock); backbone_gw = claim->backbone_gw; kref_get(&backbone_gw->refcount); spin_unlock_bh(&claim->backbone_lock); return backbone_gw; } /** * batadv_bla_del_claim() - delete a claim from the claim hash * @bat_priv: the bat priv with all the mesh interface information * @mac: mac address of the claim to be removed * @vid: VLAN id for the claim to be removed */ static void batadv_bla_del_claim(struct batadv_priv *bat_priv, const u8 *mac, const unsigned short vid) { struct batadv_bla_claim search_claim, *claim; struct batadv_bla_claim *claim_removed_entry; struct hlist_node *claim_removed_node; ether_addr_copy(search_claim.addr, mac); search_claim.vid = vid; claim = batadv_claim_hash_find(bat_priv, &search_claim); if (!claim) return; batadv_dbg(BATADV_DBG_BLA, bat_priv, "%s(): %pM, vid %d\n", __func__, mac, batadv_print_vid(vid)); claim_removed_node = batadv_hash_remove(bat_priv->bla.claim_hash, batadv_compare_claim, batadv_choose_claim, claim); if (!claim_removed_node) goto free_claim; /* reference from the hash is gone */ claim_removed_entry = hlist_entry(claim_removed_node, struct batadv_bla_claim, hash_entry); batadv_claim_put(claim_removed_entry); free_claim: /* don't need the reference from hash_find() anymore */ batadv_claim_put(claim); } /** * batadv_handle_announce() - check for ANNOUNCE frame * @bat_priv: the bat priv with all the mesh interface information * @an_addr: announcement mac address (ARP Sender HW 
address) * @backbone_addr: originator address of the sender (Ethernet source MAC) * @vid: the VLAN ID of the frame * * Return: true if handled */ static bool batadv_handle_announce(struct batadv_priv *bat_priv, u8 *an_addr, u8 *backbone_addr, unsigned short vid) { struct batadv_bla_backbone_gw *backbone_gw; u16 backbone_crc, crc; if (memcmp(an_addr, batadv_announce_mac, 4) != 0) return false; backbone_gw = batadv_bla_get_backbone_gw(bat_priv, backbone_addr, vid, false); if (unlikely(!backbone_gw)) return true; /* handle as ANNOUNCE frame */ backbone_gw->lasttime = jiffies; crc = ntohs(*((__force __be16 *)(&an_addr[4]))); batadv_dbg(BATADV_DBG_BLA, bat_priv, "%s(): ANNOUNCE vid %d (sent by %pM)... CRC = %#.4x\n", __func__, batadv_print_vid(vid), backbone_gw->orig, crc); spin_lock_bh(&backbone_gw->crc_lock); backbone_crc = backbone_gw->crc; spin_unlock_bh(&backbone_gw->crc_lock); if (backbone_crc != crc) { batadv_dbg(BATADV_DBG_BLA, backbone_gw->bat_priv, "%s(): CRC FAILED for %pM/%d (my = %#.4x, sent = %#.4x)\n", __func__, backbone_gw->orig, batadv_print_vid(backbone_gw->vid), backbone_crc, crc); batadv_bla_send_request(backbone_gw); } else { /* if we have sent a request and the crc was OK, * we can allow traffic again. */ if (atomic_read(&backbone_gw->request_sent)) { atomic_dec(&backbone_gw->bat_priv->bla.num_requests); atomic_set(&backbone_gw->request_sent, 0); } } batadv_backbone_gw_put(backbone_gw); return true; } /** * batadv_handle_request() - check for REQUEST frame * @bat_priv: the bat priv with all the mesh interface information * @primary_if: the primary hard interface of this batman mesh interface * @backbone_addr: backbone address to be requested (ARP sender HW MAC) * @ethhdr: ethernet header of a packet * @vid: the VLAN ID of the frame * * Return: true if handled */ static bool batadv_handle_request(struct batadv_priv *bat_priv, struct batadv_hard_iface *primary_if, u8 *backbone_addr, struct ethhdr *ethhdr, unsigned short vid) { /* check for REQUEST frame */ if (!batadv_compare_eth(backbone_addr, ethhdr->h_dest)) return false; /* sanity check, this should not happen on a normal switch, * we ignore it in this case. 
*/ if (!batadv_compare_eth(ethhdr->h_dest, primary_if->net_dev->dev_addr)) return true; batadv_dbg(BATADV_DBG_BLA, bat_priv, "%s(): REQUEST vid %d (sent by %pM)...\n", __func__, batadv_print_vid(vid), ethhdr->h_source); batadv_bla_answer_request(bat_priv, primary_if, vid); return true; } /** * batadv_handle_unclaim() - check for UNCLAIM frame * @bat_priv: the bat priv with all the mesh interface information * @primary_if: the primary hard interface of this batman mesh interface * @backbone_addr: originator address of the backbone (Ethernet source) * @claim_addr: Client to be unclaimed (ARP sender HW MAC) * @vid: the VLAN ID of the frame * * Return: true if handled */ static bool batadv_handle_unclaim(struct batadv_priv *bat_priv, struct batadv_hard_iface *primary_if, const u8 *backbone_addr, const u8 *claim_addr, unsigned short vid) { struct batadv_bla_backbone_gw *backbone_gw; /* unclaim in any case if it is our own */ if (primary_if && batadv_compare_eth(backbone_addr, primary_if->net_dev->dev_addr)) batadv_bla_send_claim(bat_priv, claim_addr, vid, BATADV_CLAIM_TYPE_UNCLAIM); backbone_gw = batadv_backbone_hash_find(bat_priv, backbone_addr, vid); if (!backbone_gw) return true; /* this must be an UNCLAIM frame */ batadv_dbg(BATADV_DBG_BLA, bat_priv, "%s(): UNCLAIM %pM on vid %d (sent by %pM)...\n", __func__, claim_addr, batadv_print_vid(vid), backbone_gw->orig); batadv_bla_del_claim(bat_priv, claim_addr, vid); batadv_backbone_gw_put(backbone_gw); return true; } /** * batadv_handle_claim() - check for CLAIM frame * @bat_priv: the bat priv with all the mesh interface information * @primary_if: the primary hard interface of this batman mesh interface * @backbone_addr: originator address of the backbone (Ethernet Source) * @claim_addr: client mac address to be claimed (ARP sender HW MAC) * @vid: the VLAN ID of the frame * * Return: true if handled */ static bool batadv_handle_claim(struct batadv_priv *bat_priv, struct batadv_hard_iface *primary_if, const u8 *backbone_addr, const u8 *claim_addr, unsigned short vid) { struct batadv_bla_backbone_gw *backbone_gw; /* register the gateway if not yet available, and add the claim. */ backbone_gw = batadv_bla_get_backbone_gw(bat_priv, backbone_addr, vid, false); if (unlikely(!backbone_gw)) return true; /* this must be a CLAIM frame */ batadv_bla_add_claim(bat_priv, claim_addr, vid, backbone_gw); if (batadv_compare_eth(backbone_addr, primary_if->net_dev->dev_addr)) batadv_bla_send_claim(bat_priv, claim_addr, vid, BATADV_CLAIM_TYPE_CLAIM); /* TODO: we could call something like tt_local_del() here. */ batadv_backbone_gw_put(backbone_gw); return true; } /** * batadv_check_claim_group() - check for claim group membership * @bat_priv: the bat priv with all the mesh interface information * @primary_if: the primary interface of this batman interface * @hw_src: the Hardware source in the ARP Header * @hw_dst: the Hardware destination in the ARP Header * @ethhdr: pointer to the Ethernet header of the claim frame * * checks if it is a claim packet and if it's on the same group. * This function also applies the group ID of the sender * if it is in the same mesh. 
* * Return: * 2 - if it is a claim packet and on the same group * 1 - if is a claim packet from another group * 0 - if it is not a claim packet */ static int batadv_check_claim_group(struct batadv_priv *bat_priv, struct batadv_hard_iface *primary_if, u8 *hw_src, u8 *hw_dst, struct ethhdr *ethhdr) { u8 *backbone_addr; struct batadv_orig_node *orig_node; struct batadv_bla_claim_dst *bla_dst, *bla_dst_own; bla_dst = (struct batadv_bla_claim_dst *)hw_dst; bla_dst_own = &bat_priv->bla.claim_dest; /* if announcement packet, use the source, * otherwise assume it is in the hw_src */ switch (bla_dst->type) { case BATADV_CLAIM_TYPE_CLAIM: backbone_addr = hw_src; break; case BATADV_CLAIM_TYPE_REQUEST: case BATADV_CLAIM_TYPE_ANNOUNCE: case BATADV_CLAIM_TYPE_UNCLAIM: backbone_addr = ethhdr->h_source; break; default: return 0; } /* don't accept claim frames from ourselves */ if (batadv_compare_eth(backbone_addr, primary_if->net_dev->dev_addr)) return 0; /* if its already the same group, it is fine. */ if (bla_dst->group == bla_dst_own->group) return 2; /* lets see if this originator is in our mesh */ orig_node = batadv_orig_hash_find(bat_priv, backbone_addr); /* don't accept claims from gateways which are not in * the same mesh or group. */ if (!orig_node) return 1; /* if our mesh friends mac is bigger, use it for ourselves. */ if (ntohs(bla_dst->group) > ntohs(bla_dst_own->group)) { batadv_dbg(BATADV_DBG_BLA, bat_priv, "taking other backbones claim group: %#.4x\n", ntohs(bla_dst->group)); bla_dst_own->group = bla_dst->group; } batadv_orig_node_put(orig_node); return 2; } /** * batadv_bla_process_claim() - Check if this is a claim frame, and process it * @bat_priv: the bat priv with all the mesh interface information * @primary_if: the primary hard interface of this batman mesh interface * @skb: the frame to be checked * * Return: true if it was a claim frame, otherwise return false to * tell the callee that it can use the frame on its own. */ static bool batadv_bla_process_claim(struct batadv_priv *bat_priv, struct batadv_hard_iface *primary_if, struct sk_buff *skb) { struct batadv_bla_claim_dst *bla_dst, *bla_dst_own; u8 *hw_src, *hw_dst; struct vlan_hdr *vhdr, vhdr_buf; struct ethhdr *ethhdr; struct arphdr *arphdr; unsigned short vid; int vlan_depth = 0; __be16 proto; int headlen; int ret; vid = batadv_get_vid(skb, 0); ethhdr = eth_hdr(skb); proto = ethhdr->h_proto; headlen = ETH_HLEN; if (vid & BATADV_VLAN_HAS_TAG) { /* Traverse the VLAN/Ethertypes. * * At this point it is known that the first protocol is a VLAN * header, so start checking at the encapsulated protocol. * * The depth of the VLAN headers is recorded to drop BLA claim * frames encapsulated into multiple VLAN headers (QinQ). */ do { vhdr = skb_header_pointer(skb, headlen, VLAN_HLEN, &vhdr_buf); if (!vhdr) return false; proto = vhdr->h_vlan_encapsulated_proto; headlen += VLAN_HLEN; vlan_depth++; } while (proto == htons(ETH_P_8021Q)); } if (proto != htons(ETH_P_ARP)) return false; /* not a claim frame */ /* this must be a ARP frame. check if it is a claim. 
*/ if (unlikely(!pskb_may_pull(skb, headlen + arp_hdr_len(skb->dev)))) return false; /* pskb_may_pull() may have modified the pointers, get ethhdr again */ ethhdr = eth_hdr(skb); arphdr = (struct arphdr *)((u8 *)ethhdr + headlen); /* Check whether the ARP frame carries a valid * IP information */ if (arphdr->ar_hrd != htons(ARPHRD_ETHER)) return false; if (arphdr->ar_pro != htons(ETH_P_IP)) return false; if (arphdr->ar_hln != ETH_ALEN) return false; if (arphdr->ar_pln != 4) return false; hw_src = (u8 *)arphdr + sizeof(struct arphdr); hw_dst = hw_src + ETH_ALEN + 4; bla_dst = (struct batadv_bla_claim_dst *)hw_dst; bla_dst_own = &bat_priv->bla.claim_dest; /* check if it is a claim frame in general */ if (memcmp(bla_dst->magic, bla_dst_own->magic, sizeof(bla_dst->magic)) != 0) return false; /* check if there is a claim frame encapsulated deeper in (QinQ) and * drop that, as this is not supported by BLA but should also not be * sent via the mesh. */ if (vlan_depth > 1) return true; /* Let the loopdetect frames on the mesh in any case. */ if (bla_dst->type == BATADV_CLAIM_TYPE_LOOPDETECT) return false; /* check if it is a claim frame. */ ret = batadv_check_claim_group(bat_priv, primary_if, hw_src, hw_dst, ethhdr); if (ret == 1) batadv_dbg(BATADV_DBG_BLA, bat_priv, "%s(): received a claim frame from another group. From: %pM on vid %d ...(hw_src %pM, hw_dst %pM)\n", __func__, ethhdr->h_source, batadv_print_vid(vid), hw_src, hw_dst); if (ret < 2) return !!ret; /* become a backbone gw ourselves on this vlan if not happened yet */ batadv_bla_update_own_backbone_gw(bat_priv, primary_if, vid); /* check for the different types of claim frames ... */ switch (bla_dst->type) { case BATADV_CLAIM_TYPE_CLAIM: if (batadv_handle_claim(bat_priv, primary_if, hw_src, ethhdr->h_source, vid)) return true; break; case BATADV_CLAIM_TYPE_UNCLAIM: if (batadv_handle_unclaim(bat_priv, primary_if, ethhdr->h_source, hw_src, vid)) return true; break; case BATADV_CLAIM_TYPE_ANNOUNCE: if (batadv_handle_announce(bat_priv, hw_src, ethhdr->h_source, vid)) return true; break; case BATADV_CLAIM_TYPE_REQUEST: if (batadv_handle_request(bat_priv, primary_if, hw_src, ethhdr, vid)) return true; break; } batadv_dbg(BATADV_DBG_BLA, bat_priv, "%s(): ERROR - this looks like a claim frame, but is useless. eth src %pM on vid %d ...(hw_src %pM, hw_dst %pM)\n", __func__, ethhdr->h_source, batadv_print_vid(vid), hw_src, hw_dst); return true; } /** * batadv_bla_purge_backbone_gw() - Remove backbone gateways after a timeout or * immediately * @bat_priv: the bat priv with all the mesh interface information * @now: whether the whole hash shall be wiped now * * Check when we last heard from other nodes, and remove them in case of * a time out, or clean all backbone gws if now is set. 
*/ static void batadv_bla_purge_backbone_gw(struct batadv_priv *bat_priv, int now) { struct batadv_bla_backbone_gw *backbone_gw; struct hlist_node *node_tmp; struct hlist_head *head; struct batadv_hashtable *hash; spinlock_t *list_lock; /* protects write access to the hash lists */ int i; hash = bat_priv->bla.backbone_hash; if (!hash) return; for (i = 0; i < hash->size; i++) { head = &hash->table[i]; list_lock = &hash->list_locks[i]; spin_lock_bh(list_lock); hlist_for_each_entry_safe(backbone_gw, node_tmp, head, hash_entry) { if (now) goto purge_now; if (!batadv_has_timed_out(backbone_gw->lasttime, BATADV_BLA_BACKBONE_TIMEOUT)) continue; batadv_dbg(BATADV_DBG_BLA, backbone_gw->bat_priv, "%s(): backbone gw %pM timed out\n", __func__, backbone_gw->orig); purge_now: /* don't wait for the pending request anymore */ if (atomic_read(&backbone_gw->request_sent)) atomic_dec(&bat_priv->bla.num_requests); batadv_bla_del_backbone_claims(backbone_gw); hlist_del_rcu(&backbone_gw->hash_entry); batadv_backbone_gw_put(backbone_gw); } spin_unlock_bh(list_lock); } } /** * batadv_bla_purge_claims() - Remove claims after a timeout or immediately * @bat_priv: the bat priv with all the mesh interface information * @primary_if: the selected primary interface, may be NULL if now is set * @now: whether the whole hash shall be wiped now * * Check when we heard last time from our own claims, and remove them in case of * a time out, or clean all claims if now is set */ static void batadv_bla_purge_claims(struct batadv_priv *bat_priv, struct batadv_hard_iface *primary_if, int now) { struct batadv_bla_backbone_gw *backbone_gw; struct batadv_bla_claim *claim; struct hlist_head *head; struct batadv_hashtable *hash; int i; hash = bat_priv->bla.claim_hash; if (!hash) return; for (i = 0; i < hash->size; i++) { head = &hash->table[i]; rcu_read_lock(); hlist_for_each_entry_rcu(claim, head, hash_entry) { backbone_gw = batadv_bla_claim_get_backbone_gw(claim); if (now) goto purge_now; if (!batadv_compare_eth(backbone_gw->orig, primary_if->net_dev->dev_addr)) goto skip; if (!batadv_has_timed_out(claim->lasttime, BATADV_BLA_CLAIM_TIMEOUT)) goto skip; batadv_dbg(BATADV_DBG_BLA, bat_priv, "%s(): timed out.\n", __func__); purge_now: batadv_dbg(BATADV_DBG_BLA, bat_priv, "%s(): %pM, vid %d\n", __func__, claim->addr, claim->vid); batadv_handle_unclaim(bat_priv, primary_if, backbone_gw->orig, claim->addr, claim->vid); skip: batadv_backbone_gw_put(backbone_gw); } rcu_read_unlock(); } } /** * batadv_bla_update_orig_address() - Update the backbone gateways when the own * originator address changes * @bat_priv: the bat priv with all the mesh interface information * @primary_if: the new selected primary_if * @oldif: the old primary interface, may be NULL */ void batadv_bla_update_orig_address(struct batadv_priv *bat_priv, struct batadv_hard_iface *primary_if, struct batadv_hard_iface *oldif) { struct batadv_bla_backbone_gw *backbone_gw; struct hlist_head *head; struct batadv_hashtable *hash; __be16 group; int i; /* reset bridge loop avoidance group id */ group = htons(crc16(0, primary_if->net_dev->dev_addr, ETH_ALEN)); bat_priv->bla.claim_dest.group = group; /* purge everything when bridge loop avoidance is turned off */ if (!atomic_read(&bat_priv->bridge_loop_avoidance)) oldif = NULL; if (!oldif) { batadv_bla_purge_claims(bat_priv, NULL, 1); batadv_bla_purge_backbone_gw(bat_priv, 1); return; } hash = bat_priv->bla.backbone_hash; if (!hash) return; for (i = 0; i < hash->size; i++) { head = &hash->table[i]; rcu_read_lock(); 
hlist_for_each_entry_rcu(backbone_gw, head, hash_entry) { /* own orig still holds the old value. */ if (!batadv_compare_eth(backbone_gw->orig, oldif->net_dev->dev_addr)) continue; ether_addr_copy(backbone_gw->orig, primary_if->net_dev->dev_addr); /* send an announce frame so others will ask for our * claims and update their tables. */ batadv_bla_send_announce(bat_priv, backbone_gw); } rcu_read_unlock(); } } /** * batadv_bla_send_loopdetect() - send a loopdetect frame * @bat_priv: the bat priv with all the mesh interface information * @backbone_gw: the backbone gateway for which a loop should be detected * * To detect loops that the bridge loop avoidance can't handle, send a loop * detection packet on the backbone. Unlike other BLA frames, this frame will * be allowed on the mesh by other nodes. If it is received on the mesh, this * indicates that there is a loop. */ static void batadv_bla_send_loopdetect(struct batadv_priv *bat_priv, struct batadv_bla_backbone_gw *backbone_gw) { batadv_dbg(BATADV_DBG_BLA, bat_priv, "Send loopdetect frame for vid %d\n", backbone_gw->vid); batadv_bla_send_claim(bat_priv, bat_priv->bla.loopdetect_addr, backbone_gw->vid, BATADV_CLAIM_TYPE_LOOPDETECT); } /** * batadv_bla_status_update() - purge bla interfaces if necessary * @net_dev: the mesh interface net device */ void batadv_bla_status_update(struct net_device *net_dev) { struct batadv_priv *bat_priv = netdev_priv(net_dev); struct batadv_hard_iface *primary_if; primary_if = batadv_primary_if_get_selected(bat_priv); if (!primary_if) return; /* this function already purges everything when bla is disabled, * so just call that one. */ batadv_bla_update_orig_address(bat_priv, primary_if, primary_if); batadv_hardif_put(primary_if); } /** * batadv_bla_periodic_work() - performs periodic bla work * @work: kernel work struct * * periodic work to do: * * purge structures when they are too old * * send announcements */ static void batadv_bla_periodic_work(struct work_struct *work) { struct delayed_work *delayed_work; struct batadv_priv *bat_priv; struct batadv_priv_bla *priv_bla; struct hlist_head *head; struct batadv_bla_backbone_gw *backbone_gw; struct batadv_hashtable *hash; struct batadv_hard_iface *primary_if; bool send_loopdetect = false; int i; delayed_work = to_delayed_work(work); priv_bla = container_of(delayed_work, struct batadv_priv_bla, work); bat_priv = container_of(priv_bla, struct batadv_priv, bla); primary_if = batadv_primary_if_get_selected(bat_priv); if (!primary_if) goto out; batadv_bla_purge_claims(bat_priv, primary_if, 0); batadv_bla_purge_backbone_gw(bat_priv, 0); if (!atomic_read(&bat_priv->bridge_loop_avoidance)) goto out; if (atomic_dec_and_test(&bat_priv->bla.loopdetect_next)) { /* set a new random mac address for the next bridge loop * detection frames. Set the locally administered bit to avoid * collisions with users mac addresses. 
*/ eth_random_addr(bat_priv->bla.loopdetect_addr); bat_priv->bla.loopdetect_addr[0] = 0xba; bat_priv->bla.loopdetect_addr[1] = 0xbe; bat_priv->bla.loopdetect_lasttime = jiffies; atomic_set(&bat_priv->bla.loopdetect_next, BATADV_BLA_LOOPDETECT_PERIODS); /* mark for sending loop detect on all VLANs */ send_loopdetect = true; } hash = bat_priv->bla.backbone_hash; if (!hash) goto out; for (i = 0; i < hash->size; i++) { head = &hash->table[i]; rcu_read_lock(); hlist_for_each_entry_rcu(backbone_gw, head, hash_entry) { if (!batadv_compare_eth(backbone_gw->orig, primary_if->net_dev->dev_addr)) continue; backbone_gw->lasttime = jiffies; batadv_bla_send_announce(bat_priv, backbone_gw); if (send_loopdetect) batadv_bla_send_loopdetect(bat_priv, backbone_gw); /* request_sent is only set after creation to avoid * problems when we are not yet known as backbone gw * in the backbone. * * We can reset this now after we waited some periods * to give bridge forward delays and bla group forming * some grace time. */ if (atomic_read(&backbone_gw->request_sent) == 0) continue; if (!atomic_dec_and_test(&backbone_gw->wait_periods)) continue; atomic_dec(&backbone_gw->bat_priv->bla.num_requests); atomic_set(&backbone_gw->request_sent, 0); } rcu_read_unlock(); } out: batadv_hardif_put(primary_if); queue_delayed_work(batadv_event_workqueue, &bat_priv->bla.work, msecs_to_jiffies(BATADV_BLA_PERIOD_LENGTH)); } /* The hash for claim and backbone hash receive the same key because they * are getting initialized by hash_new with the same key. Reinitializing * them with to different keys to allow nested locking without generating * lockdep warnings */ static struct lock_class_key batadv_claim_hash_lock_class_key; static struct lock_class_key batadv_backbone_hash_lock_class_key; /** * batadv_bla_init() - initialize all bla structures * @bat_priv: the bat priv with all the mesh interface information * * Return: 0 on success, < 0 on error. 
*/ int batadv_bla_init(struct batadv_priv *bat_priv) { int i; u8 claim_dest[ETH_ALEN] = {0xff, 0x43, 0x05, 0x00, 0x00, 0x00}; struct batadv_hard_iface *primary_if; u16 crc; unsigned long entrytime; spin_lock_init(&bat_priv->bla.bcast_duplist_lock); batadv_dbg(BATADV_DBG_BLA, bat_priv, "bla hash registering\n"); /* setting claim destination address */ memcpy(&bat_priv->bla.claim_dest.magic, claim_dest, 3); bat_priv->bla.claim_dest.type = 0; primary_if = batadv_primary_if_get_selected(bat_priv); if (primary_if) { crc = crc16(0, primary_if->net_dev->dev_addr, ETH_ALEN); bat_priv->bla.claim_dest.group = htons(crc); batadv_hardif_put(primary_if); } else { bat_priv->bla.claim_dest.group = 0; /* will be set later */ } /* initialize the duplicate list */ entrytime = jiffies - msecs_to_jiffies(BATADV_DUPLIST_TIMEOUT); for (i = 0; i < BATADV_DUPLIST_SIZE; i++) bat_priv->bla.bcast_duplist[i].entrytime = entrytime; bat_priv->bla.bcast_duplist_curr = 0; atomic_set(&bat_priv->bla.loopdetect_next, BATADV_BLA_LOOPDETECT_PERIODS); if (bat_priv->bla.claim_hash) return 0; bat_priv->bla.claim_hash = batadv_hash_new(128); if (!bat_priv->bla.claim_hash) return -ENOMEM; bat_priv->bla.backbone_hash = batadv_hash_new(32); if (!bat_priv->bla.backbone_hash) { batadv_hash_destroy(bat_priv->bla.claim_hash); return -ENOMEM; } batadv_hash_set_lock_class(bat_priv->bla.claim_hash, &batadv_claim_hash_lock_class_key); batadv_hash_set_lock_class(bat_priv->bla.backbone_hash, &batadv_backbone_hash_lock_class_key); batadv_dbg(BATADV_DBG_BLA, bat_priv, "bla hashes initialized\n"); INIT_DELAYED_WORK(&bat_priv->bla.work, batadv_bla_periodic_work); queue_delayed_work(batadv_event_workqueue, &bat_priv->bla.work, msecs_to_jiffies(BATADV_BLA_PERIOD_LENGTH)); return 0; } /** * batadv_skb_crc32() - calculate CRC32 of the whole packet and skip bytes in * the header * @skb: skb pointing to fragmented socket buffers * @payload_ptr: Pointer to position inside the head buffer of the skb * marking the start of the data to be CRC'ed * * payload_ptr must always point to an address in the skb head buffer and not to * a fragment. * * Return: big endian crc32c of the checksummed data */ static __be32 batadv_skb_crc32(struct sk_buff *skb, u8 *payload_ptr) { unsigned int to = skb->len; unsigned int consumed = 0; struct skb_seq_state st; unsigned int from; unsigned int len; const u8 *data; u32 crc = 0; from = (unsigned int)(payload_ptr - skb->data); skb_prepare_seq_read(skb, from, to, &st); while ((len = skb_seq_read(consumed, &data, &st)) != 0) { crc = crc32c(crc, data, len); consumed += len; } return htonl(crc); } /** * batadv_bla_check_duplist() - Check if a frame is in the broadcast dup. * @bat_priv: the bat priv with all the mesh interface information * @skb: contains the multicast packet to be checked * @payload_ptr: pointer to position inside the head buffer of the skb * marking the start of the data to be CRC'ed * @orig: originator mac address, NULL if unknown * * Check if it is on our broadcast list. Another gateway might have sent the * same packet because it is connected to the same backbone, so we have to * remove this duplicate. * * This is performed by checking the CRC, which will tell us * with a good chance that it is the same packet. If it is furthermore * sent by another host, drop it. We allow equal packets from * the same host however as this might be intended. * * Return: true if a packet is in the duplicate list, false otherwise. 
*/ static bool batadv_bla_check_duplist(struct batadv_priv *bat_priv, struct sk_buff *skb, u8 *payload_ptr, const u8 *orig) { struct batadv_bcast_duplist_entry *entry; bool ret = false; int i, curr; __be32 crc; /* calculate the crc ... */ crc = batadv_skb_crc32(skb, payload_ptr); spin_lock_bh(&bat_priv->bla.bcast_duplist_lock); for (i = 0; i < BATADV_DUPLIST_SIZE; i++) { curr = (bat_priv->bla.bcast_duplist_curr + i); curr %= BATADV_DUPLIST_SIZE; entry = &bat_priv->bla.bcast_duplist[curr]; /* we can stop searching if the entry is too old ; * later entries will be even older */ if (batadv_has_timed_out(entry->entrytime, BATADV_DUPLIST_TIMEOUT)) break; if (entry->crc != crc) continue; /* are the originators both known and not anonymous? */ if (orig && !is_zero_ether_addr(orig) && !is_zero_ether_addr(entry->orig)) { /* If known, check if the new frame came from * the same originator: * We are safe to take identical frames from the * same orig, if known, as multiplications in * the mesh are detected via the (orig, seqno) pair. * So we can be a bit more liberal here and allow * identical frames from the same orig which the source * host might have sent multiple times on purpose. */ if (batadv_compare_eth(entry->orig, orig)) continue; } /* this entry seems to match: same crc, not too old, * and from another gw. therefore return true to forbid it. */ ret = true; goto out; } /* not found, add a new entry (overwrite the oldest entry) * and allow it, its the first occurrence. */ curr = (bat_priv->bla.bcast_duplist_curr + BATADV_DUPLIST_SIZE - 1); curr %= BATADV_DUPLIST_SIZE; entry = &bat_priv->bla.bcast_duplist[curr]; entry->crc = crc; entry->entrytime = jiffies; /* known originator */ if (orig) ether_addr_copy(entry->orig, orig); /* anonymous originator */ else eth_zero_addr(entry->orig); bat_priv->bla.bcast_duplist_curr = curr; out: spin_unlock_bh(&bat_priv->bla.bcast_duplist_lock); return ret; } /** * batadv_bla_check_ucast_duplist() - Check if a frame is in the broadcast dup. * @bat_priv: the bat priv with all the mesh interface information * @skb: contains the multicast packet to be checked, decapsulated from a * unicast_packet * * Check if it is on our broadcast list. Another gateway might have sent the * same packet because it is connected to the same backbone, so we have to * remove this duplicate. * * Return: true if a packet is in the duplicate list, false otherwise. */ static bool batadv_bla_check_ucast_duplist(struct batadv_priv *bat_priv, struct sk_buff *skb) { return batadv_bla_check_duplist(bat_priv, skb, (u8 *)skb->data, NULL); } /** * batadv_bla_check_bcast_duplist() - Check if a frame is in the broadcast dup. * @bat_priv: the bat priv with all the mesh interface information * @skb: contains the bcast_packet to be checked * * Check if it is on our broadcast list. Another gateway might have sent the * same packet because it is connected to the same backbone, so we have to * remove this duplicate. * * Return: true if a packet is in the duplicate list, false otherwise. */ bool batadv_bla_check_bcast_duplist(struct batadv_priv *bat_priv, struct sk_buff *skb) { struct batadv_bcast_packet *bcast_packet; u8 *payload_ptr; bcast_packet = (struct batadv_bcast_packet *)skb->data; payload_ptr = (u8 *)(bcast_packet + 1); return batadv_bla_check_duplist(bat_priv, skb, payload_ptr, bcast_packet->orig); } /** * batadv_bla_is_backbone_gw_orig() - Check if the originator is a gateway for * the VLAN identified by vid. 
* @bat_priv: the bat priv with all the mesh interface information * @orig: originator mac address * @vid: VLAN identifier * * Return: true if orig is a backbone for this vid, false otherwise. */ bool batadv_bla_is_backbone_gw_orig(struct batadv_priv *bat_priv, u8 *orig, unsigned short vid) { struct batadv_hashtable *hash = bat_priv->bla.backbone_hash; struct hlist_head *head; struct batadv_bla_backbone_gw *backbone_gw; int i; if (!atomic_read(&bat_priv->bridge_loop_avoidance)) return false; if (!hash) return false; for (i = 0; i < hash->size; i++) { head = &hash->table[i]; rcu_read_lock(); hlist_for_each_entry_rcu(backbone_gw, head, hash_entry) { if (batadv_compare_eth(backbone_gw->orig, orig) && backbone_gw->vid == vid) { rcu_read_unlock(); return true; } } rcu_read_unlock(); } return false; } /** * batadv_bla_is_backbone_gw() - check if originator is a backbone gw for a VLAN * @skb: the frame to be checked * @orig_node: the orig_node of the frame * @hdr_size: maximum length of the frame * * Return: true if the orig_node is also a gateway on the mesh interface, * otherwise it returns false. */ bool batadv_bla_is_backbone_gw(struct sk_buff *skb, struct batadv_orig_node *orig_node, int hdr_size) { struct batadv_bla_backbone_gw *backbone_gw; unsigned short vid; if (!atomic_read(&orig_node->bat_priv->bridge_loop_avoidance)) return false; /* first, find out the vid. */ if (!pskb_may_pull(skb, hdr_size + ETH_HLEN)) return false; vid = batadv_get_vid(skb, hdr_size); /* see if this originator is a backbone gw for this VLAN */ backbone_gw = batadv_backbone_hash_find(orig_node->bat_priv, orig_node->orig, vid); if (!backbone_gw) return false; batadv_backbone_gw_put(backbone_gw); return true; } /** * batadv_bla_free() - free all bla structures * @bat_priv: the bat priv with all the mesh interface information * * for meshinterface free or module unload */ void batadv_bla_free(struct batadv_priv *bat_priv) { struct batadv_hard_iface *primary_if; cancel_delayed_work_sync(&bat_priv->bla.work); primary_if = batadv_primary_if_get_selected(bat_priv); if (bat_priv->bla.claim_hash) { batadv_bla_purge_claims(bat_priv, primary_if, 1); batadv_hash_destroy(bat_priv->bla.claim_hash); bat_priv->bla.claim_hash = NULL; } if (bat_priv->bla.backbone_hash) { batadv_bla_purge_backbone_gw(bat_priv, 1); batadv_hash_destroy(bat_priv->bla.backbone_hash); bat_priv->bla.backbone_hash = NULL; } batadv_hardif_put(primary_if); } /** * batadv_bla_loopdetect_check() - check and handle a detected loop * @bat_priv: the bat priv with all the mesh interface information * @skb: the packet to check * @primary_if: interface where the request came on * @vid: the VLAN ID of the frame * * Checks if this packet is a loop detect frame which has been sent by us, * throws an uevent and logs the event if that is the case. * * Return: true if it is a loop detect frame which is to be dropped, false * otherwise. */ static bool batadv_bla_loopdetect_check(struct batadv_priv *bat_priv, struct sk_buff *skb, struct batadv_hard_iface *primary_if, unsigned short vid) { struct batadv_bla_backbone_gw *backbone_gw; struct ethhdr *ethhdr; bool ret; ethhdr = eth_hdr(skb); /* Only check for the MAC address and skip more checks here for * performance reasons - this function is on the hotpath, after all. */ if (!batadv_compare_eth(ethhdr->h_source, bat_priv->bla.loopdetect_addr)) return false; /* If the packet came too late, don't forward it on the mesh * but don't consider that as loop. It might be a coincidence. 
*/ if (batadv_has_timed_out(bat_priv->bla.loopdetect_lasttime, BATADV_BLA_LOOPDETECT_TIMEOUT)) return true; backbone_gw = batadv_bla_get_backbone_gw(bat_priv, primary_if->net_dev->dev_addr, vid, true); if (unlikely(!backbone_gw)) return true; ret = queue_work(batadv_event_workqueue, &backbone_gw->report_work); /* backbone_gw is unreferenced in the report work function * if queue_work() call was successful */ if (!ret) batadv_backbone_gw_put(backbone_gw); return true; } /** * batadv_bla_rx() - check packets coming from the mesh. * @bat_priv: the bat priv with all the mesh interface information * @skb: the frame to be checked * @vid: the VLAN ID of the frame * @packet_type: the batman packet type this frame came in * * batadv_bla_rx avoidance checks if: * * we have to race for a claim * * if the frame is allowed on the LAN * * In these cases, the skb is further handled by this function * * Return: true if handled, otherwise it returns false and the caller shall * further process the skb. */ bool batadv_bla_rx(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid, int packet_type) { struct batadv_bla_backbone_gw *backbone_gw; struct ethhdr *ethhdr; struct batadv_bla_claim search_claim, *claim = NULL; struct batadv_hard_iface *primary_if; bool own_claim; bool ret; ethhdr = eth_hdr(skb); primary_if = batadv_primary_if_get_selected(bat_priv); if (!primary_if) goto handled; if (!atomic_read(&bat_priv->bridge_loop_avoidance)) goto allow; if (batadv_bla_loopdetect_check(bat_priv, skb, primary_if, vid)) goto handled; if (unlikely(atomic_read(&bat_priv->bla.num_requests))) /* don't allow multicast packets while requests are in flight */ if (is_multicast_ether_addr(ethhdr->h_dest)) /* Both broadcast flooding or multicast-via-unicasts * delivery might send to multiple backbone gateways * sharing the same LAN and therefore need to coordinate * which backbone gateway forwards into the LAN, * by claiming the payload source address. * * Broadcast flooding and multicast-via-unicasts * delivery use the following two batman packet types. * Note: explicitly exclude BATADV_UNICAST_4ADDR, * as the DHCP gateway feature will send explicitly * to only one BLA gateway, so the claiming process * should be avoided there. */ if (packet_type == BATADV_BCAST || packet_type == BATADV_UNICAST) goto handled; /* potential duplicates from foreign BLA backbone gateways via * multicast-in-unicast packets */ if (is_multicast_ether_addr(ethhdr->h_dest) && packet_type == BATADV_UNICAST && batadv_bla_check_ucast_duplist(bat_priv, skb)) goto handled; ether_addr_copy(search_claim.addr, ethhdr->h_source); search_claim.vid = vid; claim = batadv_claim_hash_find(bat_priv, &search_claim); if (!claim) { bool local = batadv_is_my_client(bat_priv, ethhdr->h_source, vid); /* possible optimization: race for a claim */ /* No claim exists yet, claim it for us! */ batadv_dbg(BATADV_DBG_BLA, bat_priv, "%s(): Unclaimed MAC %pM found. Claim it. Local: %s\n", __func__, ethhdr->h_source, str_yes_no(local)); batadv_handle_claim(bat_priv, primary_if, primary_if->net_dev->dev_addr, ethhdr->h_source, vid); goto allow; } /* if it is our own claim ... */ backbone_gw = batadv_bla_claim_get_backbone_gw(claim); own_claim = batadv_compare_eth(backbone_gw->orig, primary_if->net_dev->dev_addr); batadv_backbone_gw_put(backbone_gw); if (own_claim) { /* ... allow it in any case */ claim->lasttime = jiffies; goto allow; } /* if it is a multicast ... 
*/ if (is_multicast_ether_addr(ethhdr->h_dest) && (packet_type == BATADV_BCAST || packet_type == BATADV_UNICAST)) { /* ... drop it. the responsible gateway is in charge. * * We need to check packet type because with the gateway * feature, broadcasts (like DHCP requests) may be sent * using a unicast 4 address packet type. See comment above. */ goto handled; } else { /* seems the client considers us as its best gateway. * send a claim and update the claim table * immediately. */ batadv_handle_claim(bat_priv, primary_if, primary_if->net_dev->dev_addr, ethhdr->h_source, vid); goto allow; } allow: batadv_bla_update_own_backbone_gw(bat_priv, primary_if, vid); ret = false; goto out; handled: kfree_skb(skb); ret = true; out: batadv_hardif_put(primary_if); batadv_claim_put(claim); return ret; } /** * batadv_bla_tx() - check packets going into the mesh * @bat_priv: the bat priv with all the mesh interface information * @skb: the frame to be checked * @vid: the VLAN ID of the frame * * batadv_bla_tx checks if: * * a claim was received which has to be processed * * the frame is allowed on the mesh * * in these cases, the skb is further handled by this function. * * This call might reallocate skb data. * * Return: true if handled, otherwise it returns false and the caller shall * further process the skb. */ bool batadv_bla_tx(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid) { struct ethhdr *ethhdr; struct batadv_bla_claim search_claim, *claim = NULL; struct batadv_bla_backbone_gw *backbone_gw; struct batadv_hard_iface *primary_if; bool client_roamed; bool ret = false; primary_if = batadv_primary_if_get_selected(bat_priv); if (!primary_if) goto out; if (!atomic_read(&bat_priv->bridge_loop_avoidance)) goto allow; if (batadv_bla_process_claim(bat_priv, primary_if, skb)) goto handled; ethhdr = eth_hdr(skb); if (unlikely(atomic_read(&bat_priv->bla.num_requests))) /* don't allow broadcasts while requests are in flight */ if (is_multicast_ether_addr(ethhdr->h_dest)) goto handled; ether_addr_copy(search_claim.addr, ethhdr->h_source); search_claim.vid = vid; claim = batadv_claim_hash_find(bat_priv, &search_claim); /* if no claim exists, allow it. */ if (!claim) goto allow; /* check if we are responsible. */ backbone_gw = batadv_bla_claim_get_backbone_gw(claim); client_roamed = batadv_compare_eth(backbone_gw->orig, primary_if->net_dev->dev_addr); batadv_backbone_gw_put(backbone_gw); if (client_roamed) { /* if yes, the client has roamed and we have * to unclaim it. */ if (batadv_has_timed_out(claim->lasttime, 100)) { /* only unclaim if the last claim entry is * older than 100 ms to make sure we really * have a roaming client here. */ batadv_dbg(BATADV_DBG_BLA, bat_priv, "%s(): Roaming client %pM detected. Unclaim it.\n", __func__, ethhdr->h_source); batadv_handle_unclaim(bat_priv, primary_if, primary_if->net_dev->dev_addr, ethhdr->h_source, vid); goto allow; } else { batadv_dbg(BATADV_DBG_BLA, bat_priv, "%s(): Race for claim %pM detected. Drop packet.\n", __func__, ethhdr->h_source); goto handled; } } /* check if it is a multicast/broadcast frame */ if (is_multicast_ether_addr(ethhdr->h_dest)) { /* drop it. the responsible gateway has forwarded it into * the backbone network. */ goto handled; } else { /* we must allow it. at least if we are * responsible for the DESTINATION. 
*/ goto allow; } allow: batadv_bla_update_own_backbone_gw(bat_priv, primary_if, vid); ret = false; goto out; handled: ret = true; out: batadv_hardif_put(primary_if); batadv_claim_put(claim); return ret; } /** * batadv_bla_claim_dump_entry() - dump one entry of the claim table * to a netlink socket * @msg: buffer for the message * @portid: netlink port * @cb: Control block containing additional options * @primary_if: primary interface * @claim: entry to dump * * Return: 0 or error code. */ static int batadv_bla_claim_dump_entry(struct sk_buff *msg, u32 portid, struct netlink_callback *cb, struct batadv_hard_iface *primary_if, struct batadv_bla_claim *claim) { const u8 *primary_addr = primary_if->net_dev->dev_addr; u16 backbone_crc; bool is_own; void *hdr; int ret = -EINVAL; hdr = genlmsg_put(msg, portid, cb->nlh->nlmsg_seq, &batadv_netlink_family, NLM_F_MULTI, BATADV_CMD_GET_BLA_CLAIM); if (!hdr) { ret = -ENOBUFS; goto out; } genl_dump_check_consistent(cb, hdr); is_own = batadv_compare_eth(claim->backbone_gw->orig, primary_addr); spin_lock_bh(&claim->backbone_gw->crc_lock); backbone_crc = claim->backbone_gw->crc; spin_unlock_bh(&claim->backbone_gw->crc_lock); if (is_own) if (nla_put_flag(msg, BATADV_ATTR_BLA_OWN)) { genlmsg_cancel(msg, hdr); goto out; } if (nla_put(msg, BATADV_ATTR_BLA_ADDRESS, ETH_ALEN, claim->addr) || nla_put_u16(msg, BATADV_ATTR_BLA_VID, claim->vid) || nla_put(msg, BATADV_ATTR_BLA_BACKBONE, ETH_ALEN, claim->backbone_gw->orig) || nla_put_u16(msg, BATADV_ATTR_BLA_CRC, backbone_crc)) { genlmsg_cancel(msg, hdr); goto out; } genlmsg_end(msg, hdr); ret = 0; out: return ret; } /** * batadv_bla_claim_dump_bucket() - dump one bucket of the claim table * to a netlink socket * @msg: buffer for the message * @portid: netlink port * @cb: Control block containing additional options * @primary_if: primary interface * @hash: hash to dump * @bucket: bucket index to dump * @idx_skip: How many entries to skip * * Return: always 0. */ static int batadv_bla_claim_dump_bucket(struct sk_buff *msg, u32 portid, struct netlink_callback *cb, struct batadv_hard_iface *primary_if, struct batadv_hashtable *hash, unsigned int bucket, int *idx_skip) { struct batadv_bla_claim *claim; int idx = 0; int ret = 0; spin_lock_bh(&hash->list_locks[bucket]); cb->seq = atomic_read(&hash->generation) << 1 | 1; hlist_for_each_entry(claim, &hash->table[bucket], hash_entry) { if (idx++ < *idx_skip) continue; ret = batadv_bla_claim_dump_entry(msg, portid, cb, primary_if, claim); if (ret) { *idx_skip = idx - 1; goto unlock; } } *idx_skip = 0; unlock: spin_unlock_bh(&hash->list_locks[bucket]); return ret; } /** * batadv_bla_claim_dump() - dump claim table to a netlink socket * @msg: buffer for the message * @cb: callback structure containing arguments * * Return: message length. 
*/ int batadv_bla_claim_dump(struct sk_buff *msg, struct netlink_callback *cb) { struct batadv_hard_iface *primary_if = NULL; int portid = NETLINK_CB(cb->skb).portid; struct net_device *mesh_iface; struct batadv_hashtable *hash; struct batadv_priv *bat_priv; int bucket = cb->args[0]; int idx = cb->args[1]; int ret = 0; mesh_iface = batadv_netlink_get_meshif(cb); if (IS_ERR(mesh_iface)) return PTR_ERR(mesh_iface); bat_priv = netdev_priv(mesh_iface); hash = bat_priv->bla.claim_hash; primary_if = batadv_primary_if_get_selected(bat_priv); if (!primary_if || primary_if->if_status != BATADV_IF_ACTIVE) { ret = -ENOENT; goto out; } while (bucket < hash->size) { if (batadv_bla_claim_dump_bucket(msg, portid, cb, primary_if, hash, bucket, &idx)) break; bucket++; } cb->args[0] = bucket; cb->args[1] = idx; ret = msg->len; out: batadv_hardif_put(primary_if); dev_put(mesh_iface); return ret; } /** * batadv_bla_backbone_dump_entry() - dump one entry of the backbone table to a * netlink socket * @msg: buffer for the message * @portid: netlink port * @cb: Control block containing additional options * @primary_if: primary interface * @backbone_gw: entry to dump * * Return: 0 or error code. */ static int batadv_bla_backbone_dump_entry(struct sk_buff *msg, u32 portid, struct netlink_callback *cb, struct batadv_hard_iface *primary_if, struct batadv_bla_backbone_gw *backbone_gw) { const u8 *primary_addr = primary_if->net_dev->dev_addr; u16 backbone_crc; bool is_own; int msecs; void *hdr; int ret = -EINVAL; hdr = genlmsg_put(msg, portid, cb->nlh->nlmsg_seq, &batadv_netlink_family, NLM_F_MULTI, BATADV_CMD_GET_BLA_BACKBONE); if (!hdr) { ret = -ENOBUFS; goto out; } genl_dump_check_consistent(cb, hdr); is_own = batadv_compare_eth(backbone_gw->orig, primary_addr); spin_lock_bh(&backbone_gw->crc_lock); backbone_crc = backbone_gw->crc; spin_unlock_bh(&backbone_gw->crc_lock); msecs = jiffies_to_msecs(jiffies - backbone_gw->lasttime); if (is_own) if (nla_put_flag(msg, BATADV_ATTR_BLA_OWN)) { genlmsg_cancel(msg, hdr); goto out; } if (nla_put(msg, BATADV_ATTR_BLA_BACKBONE, ETH_ALEN, backbone_gw->orig) || nla_put_u16(msg, BATADV_ATTR_BLA_VID, backbone_gw->vid) || nla_put_u16(msg, BATADV_ATTR_BLA_CRC, backbone_crc) || nla_put_u32(msg, BATADV_ATTR_LAST_SEEN_MSECS, msecs)) { genlmsg_cancel(msg, hdr); goto out; } genlmsg_end(msg, hdr); ret = 0; out: return ret; } /** * batadv_bla_backbone_dump_bucket() - dump one bucket of the backbone table to * a netlink socket * @msg: buffer for the message * @portid: netlink port * @cb: Control block containing additional options * @primary_if: primary interface * @hash: hash to dump * @bucket: bucket index to dump * @idx_skip: How many entries to skip * * Return: always 0. 
*/ static int batadv_bla_backbone_dump_bucket(struct sk_buff *msg, u32 portid, struct netlink_callback *cb, struct batadv_hard_iface *primary_if, struct batadv_hashtable *hash, unsigned int bucket, int *idx_skip) { struct batadv_bla_backbone_gw *backbone_gw; int idx = 0; int ret = 0; spin_lock_bh(&hash->list_locks[bucket]); cb->seq = atomic_read(&hash->generation) << 1 | 1; hlist_for_each_entry(backbone_gw, &hash->table[bucket], hash_entry) { if (idx++ < *idx_skip) continue; ret = batadv_bla_backbone_dump_entry(msg, portid, cb, primary_if, backbone_gw); if (ret) { *idx_skip = idx - 1; goto unlock; } } *idx_skip = 0; unlock: spin_unlock_bh(&hash->list_locks[bucket]); return ret; } /** * batadv_bla_backbone_dump() - dump backbone table to a netlink socket * @msg: buffer for the message * @cb: callback structure containing arguments * * Return: message length. */ int batadv_bla_backbone_dump(struct sk_buff *msg, struct netlink_callback *cb) { struct batadv_hard_iface *primary_if = NULL; int portid = NETLINK_CB(cb->skb).portid; struct net_device *mesh_iface; struct batadv_hashtable *hash; struct batadv_priv *bat_priv; int bucket = cb->args[0]; int idx = cb->args[1]; int ret = 0; mesh_iface = batadv_netlink_get_meshif(cb); if (IS_ERR(mesh_iface)) return PTR_ERR(mesh_iface); bat_priv = netdev_priv(mesh_iface); hash = bat_priv->bla.backbone_hash; primary_if = batadv_primary_if_get_selected(bat_priv); if (!primary_if || primary_if->if_status != BATADV_IF_ACTIVE) { ret = -ENOENT; goto out; } while (bucket < hash->size) { if (batadv_bla_backbone_dump_bucket(msg, portid, cb, primary_if, hash, bucket, &idx)) break; bucket++; } cb->args[0] = bucket; cb->args[1] = idx; ret = msg->len; out: batadv_hardif_put(primary_if); dev_put(mesh_iface); return ret; } #ifdef CONFIG_BATMAN_ADV_DAT /** * batadv_bla_check_claim() - check if address is claimed * * @bat_priv: the bat priv with all the mesh interface information * @addr: mac address of which the claim status is checked * @vid: the VLAN ID * * addr is checked if this address is claimed by the local device itself. * * Return: true if bla is disabled or the mac is claimed by the device, * false if the device addr is already claimed by another gateway */ bool batadv_bla_check_claim(struct batadv_priv *bat_priv, u8 *addr, unsigned short vid) { struct batadv_bla_claim search_claim; struct batadv_bla_claim *claim = NULL; struct batadv_hard_iface *primary_if = NULL; bool ret = true; if (!atomic_read(&bat_priv->bridge_loop_avoidance)) return ret; primary_if = batadv_primary_if_get_selected(bat_priv); if (!primary_if) return ret; /* First look if the mac address is claimed */ ether_addr_copy(search_claim.addr, addr); search_claim.vid = vid; claim = batadv_claim_hash_find(bat_priv, &search_claim); /* If there is a claim and we are not owner of the claim, * return false. */ if (claim) { if (!batadv_compare_eth(claim->backbone_gw->orig, primary_if->net_dev->dev_addr)) ret = false; batadv_claim_put(claim); } batadv_hardif_put(primary_if); return ret; } #endif
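The CRC-based broadcast duplicate list implemented by batadv_bla_check_duplist() above is easier to follow in isolation. The stand-alone sketch below models the same idea in plain user-space C under stated assumptions: every name in it (dup_entry, dup_list_check, payload_crc, DUP_LIST_SIZE, DUP_TIMEOUT_SEC) is hypothetical, a toy checksum and time() stand in for crc32c() and jiffies, and locking is omitted. It illustrates the algorithm only and is not the batman-adv API.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <time.h>

#define DUP_LIST_SIZE   16   /* hypothetical, plays the role of BATADV_DUPLIST_SIZE    */
#define DUP_TIMEOUT_SEC 1    /* hypothetical, plays the role of BATADV_DUPLIST_TIMEOUT */

struct dup_entry {
	uint32_t crc;       /* checksum of the frame payload         */
	uint8_t  orig[6];   /* originator MAC, all-zero when unknown */
	time_t   seen;      /* when this slot was last written       */
};

static struct dup_entry dup_list[DUP_LIST_SIZE];
static int dup_curr;

/* Toy payload checksum; the kernel code uses crc32c() over the skb instead. */
static uint32_t payload_crc(const uint8_t *data, size_t len)
{
	uint32_t crc = 0;

	while (len--)
		crc = (crc << 5) + crc + *data++;
	return crc;
}

/*
 * Return true if the frame looks like a duplicate relayed by another
 * backbone gateway (same checksum, entry not timed out, different or
 * unknown originator). Otherwise record it by overwriting the oldest
 * slot of the ring buffer and allow it.
 */
static bool dup_list_check(const uint8_t *payload, size_t len,
			   const uint8_t *orig /* may be NULL */)
{
	uint32_t crc = payload_crc(payload, len);
	time_t now = time(NULL);
	int i, curr;

	for (i = 0; i < DUP_LIST_SIZE; i++) {
		curr = (dup_curr + i) % DUP_LIST_SIZE;

		/* entries are scanned newest first; later ones are older */
		if (now - dup_list[curr].seen > DUP_TIMEOUT_SEC)
			break;
		if (dup_list[curr].crc != crc)
			continue;
		/* identical frames from the same known originator are allowed */
		if (orig && !memcmp(dup_list[curr].orig, orig, 6))
			continue;
		return true;	/* same payload from another gateway: drop */
	}

	/* first occurrence: claim the slot behind the current head */
	curr = (dup_curr + DUP_LIST_SIZE - 1) % DUP_LIST_SIZE;
	dup_list[curr].crc = crc;
	dup_list[curr].seen = now;
	if (orig)
		memcpy(dup_list[curr].orig, orig, 6);
	else
		memset(dup_list[curr].orig, 0, 6);
	dup_curr = curr;

	return false;
}

int main(void)
{
	const uint8_t gw_a[6] = { 0x02, 0, 0, 0, 0, 0x0a };
	const uint8_t gw_b[6] = { 0x02, 0, 0, 0, 0, 0x0b };
	const uint8_t frame[] = "example broadcast payload";

	printf("%d\n", dup_list_check(frame, sizeof(frame), gw_a)); /* 0: first occurrence  */
	printf("%d\n", dup_list_check(frame, sizeof(frame), gw_a)); /* 0: same originator   */
	printf("%d\n", dup_list_check(frame, sizeof(frame), gw_b)); /* 1: foreign duplicate */
	return 0;
}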
// SPDX-License-Identifier: GPL-2.0 /* * thermal.c - Generic Thermal Management Sysfs support. * * Copyright (C) 2008 Intel Corp * Copyright (C) 2008 Zhang Rui <rui.zhang@intel.com> * Copyright (C) 2008 Sujith Thomas <sujith.thomas@intel.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/device.h> #include <linux/err.h> #include <linux/export.h> #include <linux/slab.h> #include <linux/kdev_t.h> #include <linux/idr.h> #include <linux/thermal.h> #include <linux/reboot.h> #include <linux/string.h> #include <linux/of.h> #include <linux/suspend.h> #define CREATE_TRACE_POINTS #include "thermal_trace.h" #include "thermal_core.h" #include "thermal_hwmon.h" static DEFINE_IDA(thermal_tz_ida); static DEFINE_IDA(thermal_cdev_ida); static LIST_HEAD(thermal_tz_list); static LIST_HEAD(thermal_cdev_list); static LIST_HEAD(thermal_governor_list); static DEFINE_MUTEX(thermal_list_lock); static DEFINE_MUTEX(thermal_governor_lock); static struct thermal_governor *def_governor; static bool thermal_pm_suspended; /* * Governor section: set of functions to handle thermal governors * * Functions to help in the life cycle of thermal governors within * the thermal core and by the thermal governor code. */ static struct thermal_governor *__find_governor(const char *name) { struct thermal_governor *pos; if (!name || !name[0]) return def_governor; list_for_each_entry(pos, &thermal_governor_list, governor_list) if (!strncasecmp(name, pos->name, THERMAL_NAME_LENGTH)) return pos; return NULL; } /** * bind_previous_governor() - bind the previous governor of the thermal zone * @tz: a valid pointer to a struct thermal_zone_device * @failed_gov_name: the name of the governor that failed to register * * Register the previous governor of the thermal zone after a new * governor has failed to be bound.
*/ static void bind_previous_governor(struct thermal_zone_device *tz, const char *failed_gov_name) { if (tz->governor && tz->governor->bind_to_tz) { if (tz->governor->bind_to_tz(tz)) { dev_err(&tz->device, "governor %s failed to bind and the previous one (%s) failed to bind again, thermal zone %s has no governor\n", failed_gov_name, tz->governor->name, tz->type); tz->governor = NULL; } } } /** * thermal_set_governor() - Switch to another governor * @tz: a valid pointer to a struct thermal_zone_device * @new_gov: pointer to the new governor * * Change the governor of thermal zone @tz. * * Return: 0 on success, an error if the new governor's bind_to_tz() failed. */ static int thermal_set_governor(struct thermal_zone_device *tz, struct thermal_governor *new_gov) { int ret = 0; if (tz->governor && tz->governor->unbind_from_tz) tz->governor->unbind_from_tz(tz); if (new_gov && new_gov->bind_to_tz) { ret = new_gov->bind_to_tz(tz); if (ret) { bind_previous_governor(tz, new_gov->name); return ret; } } tz->governor = new_gov; return ret; } int thermal_register_governor(struct thermal_governor *governor) { int err; const char *name; struct thermal_zone_device *pos; if (!governor) return -EINVAL; guard(mutex)(&thermal_governor_lock); err = -EBUSY; if (!__find_governor(governor->name)) { bool match_default; err = 0; list_add(&governor->governor_list, &thermal_governor_list); match_default = !strncmp(governor->name, DEFAULT_THERMAL_GOVERNOR, THERMAL_NAME_LENGTH); if (!def_governor && match_default) def_governor = governor; } guard(mutex)(&thermal_list_lock); list_for_each_entry(pos, &thermal_tz_list, node) { /* * only thermal zones with specified tz->tzp->governor_name * may run with tz->governor unset */ if (pos->governor) continue; name = pos->tzp->governor_name; if (!strncasecmp(name, governor->name, THERMAL_NAME_LENGTH)) { int ret; ret = thermal_set_governor(pos, governor); if (ret) dev_err(&pos->device, "Failed to set governor %s for thermal zone %s: %d\n", governor->name, pos->type, ret); } } return err; } void thermal_unregister_governor(struct thermal_governor *governor) { struct thermal_zone_device *pos; if (!governor) return; guard(mutex)(&thermal_governor_lock); if (!__find_governor(governor->name)) return; list_del(&governor->governor_list); guard(mutex)(&thermal_list_lock); list_for_each_entry(pos, &thermal_tz_list, node) { if (!strncasecmp(pos->governor->name, governor->name, THERMAL_NAME_LENGTH)) thermal_set_governor(pos, NULL); } } int thermal_zone_device_set_policy(struct thermal_zone_device *tz, char *policy) { struct thermal_governor *gov; int ret = -EINVAL; guard(mutex)(&thermal_governor_lock); guard(thermal_zone)(tz); gov = __find_governor(strim(policy)); if (gov) ret = thermal_set_governor(tz, gov); thermal_notify_tz_gov_change(tz, policy); return ret; } int thermal_build_list_of_policies(char *buf) { struct thermal_governor *pos; ssize_t count = 0; guard(mutex)(&thermal_governor_lock); list_for_each_entry(pos, &thermal_governor_list, governor_list) { count += sysfs_emit_at(buf, count, "%s ", pos->name); } count += sysfs_emit_at(buf, count, "\n"); return count; } static void __init thermal_unregister_governors(void) { struct thermal_governor **governor; for_each_governor_table(governor) thermal_unregister_governor(*governor); } static int __init thermal_register_governors(void) { int ret = 0; struct thermal_governor **governor; for_each_governor_table(governor) { ret = thermal_register_governor(*governor); if (ret) { pr_err("Failed to register governor: '%s'", (*governor)->name);
break; } pr_info("Registered thermal governor '%s'", (*governor)->name); } if (ret) { struct thermal_governor **gov; for_each_governor_table(gov) { if (gov == governor) break; thermal_unregister_governor(*gov); } } return ret; } static int __thermal_zone_device_set_mode(struct thermal_zone_device *tz, enum thermal_device_mode mode) { if (tz->ops.change_mode) { int ret; ret = tz->ops.change_mode(tz, mode); if (ret) return ret; } tz->mode = mode; return 0; } static void thermal_zone_broken_disable(struct thermal_zone_device *tz) { struct thermal_trip_desc *td; dev_err(&tz->device, "Unable to get temperature, disabling!\n"); /* * This function only runs for enabled thermal zones, so no need to * check for the current mode. */ __thermal_zone_device_set_mode(tz, THERMAL_DEVICE_DISABLED); thermal_notify_tz_disable(tz); for_each_trip_desc(tz, td) { if (td->trip.type == THERMAL_TRIP_CRITICAL && td->trip.temperature > THERMAL_TEMP_INVALID) { dev_crit(&tz->device, "Disabled thermal zone with critical trip point\n"); return; } } } /* * Zone update section: main control loop applied to each zone while monitoring * in polling mode. The monitoring is done using a workqueue. * Same update may be done on a zone by calling thermal_zone_device_update(). * * An update means: * - Non-critical trips will invoke the governor responsible for that zone; * - Hot trips will produce a notification to userspace; * - Critical trip point will cause a system shutdown. */ static void thermal_zone_device_set_polling(struct thermal_zone_device *tz, unsigned long delay) { if (delay > HZ) delay = round_jiffies_relative(delay); mod_delayed_work(system_freezable_power_efficient_wq, &tz->poll_queue, delay); } static void thermal_zone_recheck(struct thermal_zone_device *tz, int error) { if (error == -EAGAIN) { thermal_zone_device_set_polling(tz, THERMAL_RECHECK_DELAY); return; } /* * Print the message once to reduce log noise. It will be followed by * another one if the temperature cannot be determined after multiple * attempts. */ if (tz->recheck_delay_jiffies == THERMAL_RECHECK_DELAY) dev_info(&tz->device, "Temperature check failed (%d)\n", error); thermal_zone_device_set_polling(tz, tz->recheck_delay_jiffies); tz->recheck_delay_jiffies += max(tz->recheck_delay_jiffies >> 1, 1ULL); if (tz->recheck_delay_jiffies > THERMAL_MAX_RECHECK_DELAY) { thermal_zone_broken_disable(tz); /* * Restore the original recheck delay value to allow the thermal * zone to try to recover when it is reenabled by user space. */ tz->recheck_delay_jiffies = THERMAL_RECHECK_DELAY; } } static void monitor_thermal_zone(struct thermal_zone_device *tz) { if (tz->passive > 0 && tz->passive_delay_jiffies) thermal_zone_device_set_polling(tz, tz->passive_delay_jiffies); else if (tz->polling_delay_jiffies) thermal_zone_device_set_polling(tz, tz->polling_delay_jiffies); } static struct thermal_governor *thermal_get_tz_governor(struct thermal_zone_device *tz) { if (tz->governor) return tz->governor; return def_governor; } void thermal_governor_update_tz(struct thermal_zone_device *tz, enum thermal_notify_event reason) { if (!tz->governor || !tz->governor->update_tz) return; tz->governor->update_tz(tz, reason); } static void thermal_zone_device_halt(struct thermal_zone_device *tz, enum hw_protection_action action) { /* * poweroff_delay_ms must be a carefully profiled positive value. * It is required for forced_emergency_poweroff_work to be scheduled.
*/ int poweroff_delay_ms = CONFIG_THERMAL_EMERGENCY_POWEROFF_DELAY_MS; const char *msg = "Temperature too high"; dev_emerg(&tz->device, "%s: critical temperature reached\n", tz->type); __hw_protection_trigger(msg, poweroff_delay_ms, action); } void thermal_zone_device_critical(struct thermal_zone_device *tz) { thermal_zone_device_halt(tz, HWPROT_ACT_DEFAULT); } EXPORT_SYMBOL(thermal_zone_device_critical); void thermal_zone_device_critical_shutdown(struct thermal_zone_device *tz) { thermal_zone_device_halt(tz, HWPROT_ACT_SHUTDOWN); } void thermal_zone_device_critical_reboot(struct thermal_zone_device *tz) { thermal_zone_device_halt(tz, HWPROT_ACT_REBOOT); } static void handle_critical_trips(struct thermal_zone_device *tz, const struct thermal_trip *trip) { trace_thermal_zone_trip(tz, thermal_zone_trip_id(tz, trip), trip->type); if (trip->type == THERMAL_TRIP_CRITICAL) tz->ops.critical(tz); else if (tz->ops.hot) tz->ops.hot(tz); } static void move_trip_to_sorted_list(struct thermal_trip_desc *td, struct list_head *list) { struct thermal_trip_desc *entry; /* * Delete upfront and then add to make relocation within the same list * work. */ list_del(&td->list_node); /* Assume that the new entry is likely to be the last one. */ list_for_each_entry_reverse(entry, list, list_node) { if (entry->threshold <= td->threshold) { list_add(&td->list_node, &entry->list_node); return; } } list_add(&td->list_node, list); } static void move_to_trips_high(struct thermal_zone_device *tz, struct thermal_trip_desc *td) { td->threshold = td->trip.temperature; move_trip_to_sorted_list(td, &tz->trips_high); } static void move_to_trips_reached(struct thermal_zone_device *tz, struct thermal_trip_desc *td) { td->threshold = td->trip.temperature - td->trip.hysteresis; move_trip_to_sorted_list(td, &tz->trips_reached); } static void move_to_trips_invalid(struct thermal_zone_device *tz, struct thermal_trip_desc *td) { td->threshold = INT_MAX; list_move(&td->list_node, &tz->trips_invalid); } static void thermal_governor_trip_crossed(struct thermal_governor *governor, struct thermal_zone_device *tz, const struct thermal_trip *trip, bool upward) { if (trip->type == THERMAL_TRIP_HOT || trip->type == THERMAL_TRIP_CRITICAL) return; if (governor->trip_crossed) governor->trip_crossed(tz, trip, upward); } static void thermal_trip_crossed(struct thermal_zone_device *tz, struct thermal_trip_desc *td, struct thermal_governor *governor, bool upward) { const struct thermal_trip *trip = &td->trip; if (upward) { if (trip->type == THERMAL_TRIP_PASSIVE) tz->passive++; else if (trip->type == THERMAL_TRIP_CRITICAL || trip->type == THERMAL_TRIP_HOT) handle_critical_trips(tz, trip); thermal_notify_tz_trip_up(tz, trip); thermal_debug_tz_trip_up(tz, trip); } else { if (trip->type == THERMAL_TRIP_PASSIVE) { tz->passive--; WARN_ON(tz->passive < 0); } thermal_notify_tz_trip_down(tz, trip); thermal_debug_tz_trip_down(tz, trip); } thermal_governor_trip_crossed(governor, tz, trip, upward); } void thermal_zone_set_trip_hyst(struct thermal_zone_device *tz, struct thermal_trip *trip, int hyst) { struct thermal_trip_desc *td = trip_to_trip_desc(trip); WRITE_ONCE(trip->hysteresis, hyst); thermal_notify_tz_trip_change(tz, trip); /* * If the zone temperature is above or at the trip temperature, the trip * is in the trips_reached list and its threshold is equal to its low * temperature. It needs to stay in that list, but its threshold needs * to be updated and the list ordering may need to be restored.
*/ if (tz->temperature >= td->threshold) move_to_trips_reached(tz, td); } void thermal_zone_set_trip_temp(struct thermal_zone_device *tz, struct thermal_trip *trip, int temp) { struct thermal_trip_desc *td = trip_to_trip_desc(trip); int old_temp = trip->temperature; if (old_temp == temp) return; WRITE_ONCE(trip->temperature, temp); thermal_notify_tz_trip_change(tz, trip); if (old_temp == THERMAL_TEMP_INVALID) { /* * The trip was invalid before the change, so move it to the * trips_high list regardless of the new temperature value * because there is no mitigation under way for it. If a * mitigation needs to be started, the trip will be moved to the * trips_reached list later. */ move_to_trips_high(tz, td); return; } if (temp == THERMAL_TEMP_INVALID) { /* * If the trip is in the trips_reached list, mitigation is under * way for it and it needs to be stopped because the trip is * effectively going away. */ if (tz->temperature >= td->threshold) thermal_trip_crossed(tz, td, thermal_get_tz_governor(tz), false); move_to_trips_invalid(tz, td); return; } /* * The trip stays on its current list, but its threshold needs to be * updated due to the temperature change and the list ordering may need * to be restored. */ if (tz->temperature >= td->threshold) move_to_trips_reached(tz, td); else move_to_trips_high(tz, td); } EXPORT_SYMBOL_GPL(thermal_zone_set_trip_temp); static void thermal_zone_handle_trips(struct thermal_zone_device *tz, struct thermal_governor *governor, int *low, int *high) { struct thermal_trip_desc *td, *next; LIST_HEAD(way_down_list); /* Check the trips that were below or at the zone temperature. */ list_for_each_entry_safe_reverse(td, next, &tz->trips_reached, list_node) { if (td->threshold <= tz->temperature) break; thermal_trip_crossed(tz, td, governor, false); /* * The current trips_high list needs to be processed before * adding new entries to it, so put them on a temporary list. */ list_move(&td->list_node, &way_down_list); } /* Check the trips that were previously above the zone temperature. */ list_for_each_entry_safe(td, next, &tz->trips_high, list_node) { if (td->threshold > tz->temperature) break; thermal_trip_crossed(tz, td, governor, true); move_to_trips_reached(tz, td); } /* Move all of the trips from the temporary list to trips_high. */ list_for_each_entry_safe(td, next, &way_down_list, list_node) move_to_trips_high(tz, td); if (!list_empty(&tz->trips_reached)) { td = list_last_entry(&tz->trips_reached, struct thermal_trip_desc, list_node); /* * Set the "low" value below the current trip threshold in case * the zone temperature is at that threshold and stays there, * which would trigger a new interrupt immediately in vain. */ *low = td->threshold - 1; } if (!list_empty(&tz->trips_high)) { td = list_first_entry(&tz->trips_high, struct thermal_trip_desc, list_node); *high = td->threshold; } } void __thermal_zone_device_update(struct thermal_zone_device *tz, enum thermal_notify_event event) { struct thermal_governor *governor = thermal_get_tz_governor(tz); int low = -INT_MAX, high = INT_MAX; int temp, ret; if (tz->state != TZ_STATE_READY || tz->mode != THERMAL_DEVICE_ENABLED) return; ret = __thermal_zone_get_temp(tz, &temp); if (ret) { thermal_zone_recheck(tz, ret); return; } else if (temp <= THERMAL_TEMP_INVALID) { /* * Special case: No valid temperature value is available, but * the zone owner does not want the core to do anything about * it. Continue regular zone polling if needed, so that this * function can be called again, but skip everything else. 
*/ goto monitor; } tz->recheck_delay_jiffies = THERMAL_RECHECK_DELAY; tz->last_temperature = tz->temperature; tz->temperature = temp; trace_thermal_temperature(tz); thermal_genl_sampling_temp(tz->id, temp); tz->notify_event = event; thermal_zone_handle_trips(tz, governor, &low, &high); thermal_thresholds_handle(tz, &low, &high); thermal_zone_set_trips(tz, low, high); if (governor->manage) governor->manage(tz); thermal_debug_update_trip_stats(tz); monitor: monitor_thermal_zone(tz); } static int thermal_zone_device_set_mode(struct thermal_zone_device *tz, enum thermal_device_mode mode) { int ret; guard(thermal_zone)(tz); /* do nothing if mode isn't changing */ if (mode == tz->mode) return 0; ret = __thermal_zone_device_set_mode(tz, mode); if (ret) return ret; __thermal_zone_device_update(tz, THERMAL_EVENT_UNSPECIFIED); if (mode == THERMAL_DEVICE_ENABLED) thermal_notify_tz_enable(tz); else thermal_notify_tz_disable(tz); return 0; } int thermal_zone_device_enable(struct thermal_zone_device *tz) { return thermal_zone_device_set_mode(tz, THERMAL_DEVICE_ENABLED); } EXPORT_SYMBOL_GPL(thermal_zone_device_enable); int thermal_zone_device_disable(struct thermal_zone_device *tz) { return thermal_zone_device_set_mode(tz, THERMAL_DEVICE_DISABLED); } EXPORT_SYMBOL_GPL(thermal_zone_device_disable); static bool thermal_zone_is_present(struct thermal_zone_device *tz) { return !list_empty(&tz->node); } void thermal_zone_device_update(struct thermal_zone_device *tz, enum thermal_notify_event event) { guard(thermal_zone)(tz); if (thermal_zone_is_present(tz)) __thermal_zone_device_update(tz, event); } EXPORT_SYMBOL_GPL(thermal_zone_device_update); int for_each_thermal_governor(int (*cb)(struct thermal_governor *, void *), void *data) { struct thermal_governor *gov; guard(mutex)(&thermal_governor_lock); list_for_each_entry(gov, &thermal_governor_list, governor_list) { int ret; ret = cb(gov, data); if (ret) return ret; } return 0; } int for_each_thermal_cooling_device(int (*cb)(struct thermal_cooling_device *, void *), void *data) { struct thermal_cooling_device *cdev; guard(mutex)(&thermal_list_lock); list_for_each_entry(cdev, &thermal_cdev_list, node) { int ret; ret = cb(cdev, data); if (ret) return ret; } return 0; } int for_each_thermal_zone(int (*cb)(struct thermal_zone_device *, void *), void *data) { struct thermal_zone_device *tz; guard(mutex)(&thermal_list_lock); list_for_each_entry(tz, &thermal_tz_list, node) { int ret; ret = cb(tz, data); if (ret) return ret; } return 0; } struct thermal_zone_device *thermal_zone_get_by_id(int id) { struct thermal_zone_device *tz; guard(mutex)(&thermal_list_lock); list_for_each_entry(tz, &thermal_tz_list, node) { if (tz->id == id) { get_device(&tz->device); return tz; } } return NULL; } /* * Device management section: cooling devices, zones devices, and binding * * Set of functions provided by the thermal core for: * - cooling devices lifecycle: registration, unregistration, * binding, and unbinding. * - thermal zone devices lifecycle: registration, unregistration, * binding, and unbinding. 
*/ static int thermal_instance_add(struct thermal_instance *new_instance, struct thermal_cooling_device *cdev, struct thermal_trip_desc *td) { struct thermal_instance *instance; list_for_each_entry(instance, &td->thermal_instances, trip_node) { if (instance->cdev == cdev) return -EEXIST; } list_add_tail(&new_instance->trip_node, &td->thermal_instances); guard(cooling_dev)(cdev); list_add_tail(&new_instance->cdev_node, &cdev->thermal_instances); return 0; } /** * thermal_bind_cdev_to_trip - bind a cooling device to a thermal zone * @tz: pointer to struct thermal_zone_device * @td: descriptor of the trip point to bind @cdev to * @cdev: pointer to struct thermal_cooling_device * @cool_spec: cooling specification for the trip point and @cdev * * This interface function binds a thermal cooling device to a given trip * point of a thermal zone device. * This function is usually called in the thermal zone device .bind callback. * * Return: 0 on success, the proper error value otherwise. */ static int thermal_bind_cdev_to_trip(struct thermal_zone_device *tz, struct thermal_trip_desc *td, struct thermal_cooling_device *cdev, struct cooling_spec *cool_spec) { struct thermal_instance *dev; bool upper_no_limit; int result; /* lower default 0, upper default max_state */ if (cool_spec->lower == THERMAL_NO_LIMIT) cool_spec->lower = 0; if (cool_spec->upper == THERMAL_NO_LIMIT) { cool_spec->upper = cdev->max_state; upper_no_limit = true; } else { upper_no_limit = false; } if (cool_spec->lower > cool_spec->upper || cool_spec->upper > cdev->max_state) return -EINVAL; dev = kzalloc(sizeof(*dev), GFP_KERNEL); if (!dev) return -ENOMEM; dev->cdev = cdev; dev->trip = &td->trip; dev->upper = cool_spec->upper; dev->upper_no_limit = upper_no_limit; dev->lower = cool_spec->lower; dev->target = THERMAL_NO_TARGET; dev->weight = cool_spec->weight; result = ida_alloc(&tz->ida, GFP_KERNEL); if (result < 0) goto free_mem; dev->id = result; sprintf(dev->name, "cdev%d", dev->id); result = sysfs_create_link(&tz->device.kobj, &cdev->device.kobj, dev->name); if (result) goto release_ida; snprintf(dev->attr_name, sizeof(dev->attr_name), "cdev%d_trip_point", dev->id); sysfs_attr_init(&dev->attr.attr); dev->attr.attr.name = dev->attr_name; dev->attr.attr.mode = 0444; dev->attr.show = trip_point_show; result = device_create_file(&tz->device, &dev->attr); if (result) goto remove_symbol_link; snprintf(dev->weight_attr_name, sizeof(dev->weight_attr_name), "cdev%d_weight", dev->id); sysfs_attr_init(&dev->weight_attr.attr); dev->weight_attr.attr.name = dev->weight_attr_name; dev->weight_attr.attr.mode = S_IWUSR | S_IRUGO; dev->weight_attr.show = weight_show; dev->weight_attr.store = weight_store; result = device_create_file(&tz->device, &dev->weight_attr); if (result) goto remove_trip_file; result = thermal_instance_add(dev, cdev, td); if (result) goto remove_weight_file; thermal_governor_update_tz(tz, THERMAL_TZ_BIND_CDEV); return 0; remove_weight_file: device_remove_file(&tz->device, &dev->weight_attr); remove_trip_file: device_remove_file(&tz->device, &dev->attr); remove_symbol_link: sysfs_remove_link(&tz->device.kobj, dev->name); release_ida: ida_free(&tz->ida, dev->id); free_mem: kfree(dev); return result; } static void thermal_instance_delete(struct thermal_instance *instance) { list_del(&instance->trip_node); guard(cooling_dev)(instance->cdev); list_del(&instance->cdev_node); } /** * thermal_unbind_cdev_from_trip - unbind a cooling device from a thermal zone. * @tz: pointer to a struct thermal_zone_device.
* @td: descriptor of the trip point to unbind @cdev from * @cdev: pointer to a struct thermal_cooling_device. * * This interface function unbinds a thermal cooling device from a given * trip point of a thermal zone device. * This function is usually called in the thermal zone device .unbind callback. */ static void thermal_unbind_cdev_from_trip(struct thermal_zone_device *tz, struct thermal_trip_desc *td, struct thermal_cooling_device *cdev) { struct thermal_instance *pos, *next; list_for_each_entry_safe(pos, next, &td->thermal_instances, trip_node) { if (pos->cdev == cdev) { thermal_instance_delete(pos); goto unbind; } } return; unbind: thermal_governor_update_tz(tz, THERMAL_TZ_UNBIND_CDEV); device_remove_file(&tz->device, &pos->weight_attr); device_remove_file(&tz->device, &pos->attr); sysfs_remove_link(&tz->device.kobj, pos->name); ida_free(&tz->ida, pos->id); kfree(pos); } static void thermal_release(struct device *dev) { struct thermal_zone_device *tz; struct thermal_cooling_device *cdev; if (!strncmp(dev_name(dev), "thermal_zone", sizeof("thermal_zone") - 1)) { tz = to_thermal_zone(dev); thermal_zone_destroy_device_groups(tz); mutex_destroy(&tz->lock); complete(&tz->removal); } else if (!strncmp(dev_name(dev), "cooling_device", sizeof("cooling_device") - 1)) { cdev = to_cooling_device(dev); thermal_cooling_device_destroy_sysfs(cdev); kfree_const(cdev->type); ida_free(&thermal_cdev_ida, cdev->id); kfree(cdev); } } static struct class *thermal_class; static inline void print_bind_err_msg(struct thermal_zone_device *tz, const struct thermal_trip_desc *td, struct thermal_cooling_device *cdev, int ret) { dev_err(&tz->device, "binding cdev %s to trip %d failed: %d\n", cdev->type, thermal_zone_trip_id(tz, &td->trip), ret); } static bool __thermal_zone_cdev_bind(struct thermal_zone_device *tz, struct thermal_cooling_device *cdev) { struct thermal_trip_desc *td; bool update_tz = false; if (!tz->ops.should_bind) return false; for_each_trip_desc(tz, td) { struct cooling_spec c = { .upper = THERMAL_NO_LIMIT, .lower = THERMAL_NO_LIMIT, .weight = THERMAL_WEIGHT_DEFAULT }; int ret; if (!tz->ops.should_bind(tz, &td->trip, cdev, &c)) continue; ret = thermal_bind_cdev_to_trip(tz, td, cdev, &c); if (ret) { print_bind_err_msg(tz, td, cdev, ret); continue; } update_tz = true; } return update_tz; } static void thermal_zone_cdev_bind(struct thermal_zone_device *tz, struct thermal_cooling_device *cdev) { guard(thermal_zone)(tz); if (__thermal_zone_cdev_bind(tz, cdev)) __thermal_zone_device_update(tz, THERMAL_EVENT_UNSPECIFIED); } static void thermal_cooling_device_init_complete(struct thermal_cooling_device *cdev) { struct thermal_zone_device *tz; guard(mutex)(&thermal_list_lock); list_add(&cdev->node, &thermal_cdev_list); list_for_each_entry(tz, &thermal_tz_list, node) thermal_zone_cdev_bind(tz, cdev); } /** * __thermal_cooling_device_register() - register a new thermal cooling device * @np: a pointer to a device tree node. * @type: the thermal cooling device type. * @devdata: device private data. * @ops: standard thermal cooling devices callbacks. * * This interface function adds a new thermal cooling device (fan/processor/...) * to /sys/class/thermal/ folder as cooling_device[0-*]. It tries to bind itself * to all the thermal zone devices registered at the same time. * It also gives the opportunity to link the cooling device to a device tree * node, so that it can be bound to a thermal zone created out of device tree. * * Return: a pointer to the created struct thermal_cooling_device or an * ERR_PTR.
Caller must check return value with IS_ERR*() helpers. */ static struct thermal_cooling_device * __thermal_cooling_device_register(struct device_node *np, const char *type, void *devdata, const struct thermal_cooling_device_ops *ops) { struct thermal_cooling_device *cdev; unsigned long current_state; int id, ret; if (!ops || !ops->get_max_state || !ops->get_cur_state || !ops->set_cur_state) return ERR_PTR(-EINVAL); if (!thermal_class) return ERR_PTR(-ENODEV); cdev = kzalloc(sizeof(*cdev), GFP_KERNEL); if (!cdev) return ERR_PTR(-ENOMEM); ret = ida_alloc(&thermal_cdev_ida, GFP_KERNEL); if (ret < 0) goto out_kfree_cdev; cdev->id = ret; id = ret; cdev->type = kstrdup_const(type ? type : "", GFP_KERNEL); if (!cdev->type) { ret = -ENOMEM; goto out_ida_remove; } mutex_init(&cdev->lock); INIT_LIST_HEAD(&cdev->thermal_instances); cdev->np = np; cdev->ops = ops; cdev->updated = false; cdev->device.class = thermal_class; cdev->devdata = devdata; ret = cdev->ops->get_max_state(cdev, &cdev->max_state); if (ret) goto out_cdev_type; /* * The cooling device's current state is only needed for debug * initialization below, so a failure to get it does not cause * the entire cooling device initialization to fail. However, * the debug will not work for the device if its initial state * cannot be determined and drivers are responsible for ensuring * that this will not happen. */ ret = cdev->ops->get_cur_state(cdev, &current_state); if (ret) current_state = ULONG_MAX; thermal_cooling_device_setup_sysfs(cdev); ret = dev_set_name(&cdev->device, "cooling_device%d", cdev->id); if (ret) goto out_cooling_dev; ret = device_register(&cdev->device); if (ret) { /* thermal_release() handles rest of the cleanup */ put_device(&cdev->device); return ERR_PTR(ret); } if (current_state <= cdev->max_state) thermal_debug_cdev_add(cdev, current_state); thermal_cooling_device_init_complete(cdev); return cdev; out_cooling_dev: thermal_cooling_device_destroy_sysfs(cdev); out_cdev_type: kfree_const(cdev->type); out_ida_remove: ida_free(&thermal_cdev_ida, id); out_kfree_cdev: kfree(cdev); return ERR_PTR(ret); } /** * thermal_cooling_device_register() - register a new thermal cooling device * @type: the thermal cooling device type. * @devdata: device private data. * @ops: standard thermal cooling devices callbacks. * * This interface function adds a new thermal cooling device (fan/processor/...) * to /sys/class/thermal/ folder as cooling_device[0-*]. It tries to bind itself * to all the thermal zone devices registered at the same time. * * Return: a pointer to the created struct thermal_cooling_device or an * ERR_PTR. Caller must check return value with IS_ERR*() helpers. */ struct thermal_cooling_device * thermal_cooling_device_register(const char *type, void *devdata, const struct thermal_cooling_device_ops *ops) { return __thermal_cooling_device_register(NULL, type, devdata, ops); } EXPORT_SYMBOL_GPL(thermal_cooling_device_register); /** * thermal_of_cooling_device_register() - register an OF thermal cooling device * @np: a pointer to a device tree node. * @type: the thermal cooling device type. * @devdata: device private data. * @ops: standard thermal cooling devices callbacks. * * This function will register a cooling device with device tree node reference. * This interface function adds a new thermal cooling device (fan/processor/...) * to /sys/class/thermal/ folder as cooling_device[0-*]. It tries to bind itself * to all the thermal zone devices registered at the same time. 
* * Return: a pointer to the created struct thermal_cooling_device or an * ERR_PTR. Caller must check return value with IS_ERR*() helpers. */ struct thermal_cooling_device * thermal_of_cooling_device_register(struct device_node *np, const char *type, void *devdata, const struct thermal_cooling_device_ops *ops) { return __thermal_cooling_device_register(np, type, devdata, ops); } EXPORT_SYMBOL_GPL(thermal_of_cooling_device_register); static void thermal_cooling_device_release(struct device *dev, void *res) { thermal_cooling_device_unregister( *(struct thermal_cooling_device **)res); } /** * devm_thermal_of_cooling_device_register() - register an OF thermal cooling * device * @dev: a valid struct device pointer of a sensor device. * @np: a pointer to a device tree node. * @type: the thermal cooling device type. * @devdata: device private data. * @ops: standard thermal cooling devices callbacks. * * This function will register a cooling device with device tree node reference. * This interface function adds a new thermal cooling device (fan/processor/...) * to /sys/class/thermal/ folder as cooling_device[0-*]. It tries to bind itself * to all the thermal zone devices registered at the same time. * * Return: a pointer to the created struct thermal_cooling_device or an * ERR_PTR. Caller must check return value with IS_ERR*() helpers. */ struct thermal_cooling_device * devm_thermal_of_cooling_device_register(struct device *dev, struct device_node *np, const char *type, void *devdata, const struct thermal_cooling_device_ops *ops) { struct thermal_cooling_device **ptr, *tcd; ptr = devres_alloc(thermal_cooling_device_release, sizeof(*ptr), GFP_KERNEL); if (!ptr) return ERR_PTR(-ENOMEM); tcd = __thermal_cooling_device_register(np, type, devdata, ops); if (IS_ERR(tcd)) { devres_free(ptr); return tcd; } *ptr = tcd; devres_add(dev, ptr); return tcd; } EXPORT_SYMBOL_GPL(devm_thermal_of_cooling_device_register); static bool thermal_cooling_device_present(struct thermal_cooling_device *cdev) { struct thermal_cooling_device *pos = NULL; list_for_each_entry(pos, &thermal_cdev_list, node) { if (pos == cdev) return true; } return false; } /** * thermal_cooling_device_update - Update a cooling device object * @cdev: Target cooling device. * * Update @cdev to reflect a change of the underlying hardware or platform. * * Must be called when the maximum cooling state of @cdev becomes invalid and so * its .get_max_state() callback needs to be run to produce the new maximum * cooling state value. */ void thermal_cooling_device_update(struct thermal_cooling_device *cdev) { struct thermal_instance *ti; unsigned long state; if (IS_ERR_OR_NULL(cdev)) return; /* * Hold thermal_list_lock throughout the update to prevent the device * from going away while being updated. */ guard(mutex)(&thermal_list_lock); if (!thermal_cooling_device_present(cdev)) return; /* * Update under the cdev lock to prevent the state from being set beyond * the new limit concurrently. 
*/ guard(cooling_dev)(cdev); if (cdev->ops->get_max_state(cdev, &cdev->max_state)) return; thermal_cooling_device_stats_reinit(cdev); list_for_each_entry(ti, &cdev->thermal_instances, cdev_node) { if (ti->upper == cdev->max_state) continue; if (ti->upper < cdev->max_state) { if (ti->upper_no_limit) ti->upper = cdev->max_state; continue; } ti->upper = cdev->max_state; if (ti->lower > ti->upper) ti->lower = ti->upper; if (ti->target == THERMAL_NO_TARGET) continue; if (ti->target > ti->upper) ti->target = ti->upper; } if (cdev->ops->get_cur_state(cdev, &state) || state > cdev->max_state) return; thermal_cooling_device_stats_update(cdev, state); } EXPORT_SYMBOL_GPL(thermal_cooling_device_update); static void __thermal_zone_cdev_unbind(struct thermal_zone_device *tz, struct thermal_cooling_device *cdev) { struct thermal_trip_desc *td; for_each_trip_desc(tz, td) thermal_unbind_cdev_from_trip(tz, td, cdev); } static void thermal_zone_cdev_unbind(struct thermal_zone_device *tz, struct thermal_cooling_device *cdev) { guard(thermal_zone)(tz); __thermal_zone_cdev_unbind(tz, cdev); } static bool thermal_cooling_device_exit(struct thermal_cooling_device *cdev) { struct thermal_zone_device *tz; guard(mutex)(&thermal_list_lock); if (!thermal_cooling_device_present(cdev)) return false; list_del(&cdev->node); list_for_each_entry(tz, &thermal_tz_list, node) thermal_zone_cdev_unbind(tz, cdev); return true; } /** * thermal_cooling_device_unregister() - removes a thermal cooling device * @cdev: Thermal cooling device to remove. */ void thermal_cooling_device_unregister(struct thermal_cooling_device *cdev) { if (!cdev) return; thermal_debug_cdev_remove(cdev); if (thermal_cooling_device_exit(cdev)) device_unregister(&cdev->device); } EXPORT_SYMBOL_GPL(thermal_cooling_device_unregister); int thermal_zone_get_crit_temp(struct thermal_zone_device *tz, int *temp) { const struct thermal_trip_desc *td; int ret = -EINVAL; if (tz->ops.get_crit_temp) return tz->ops.get_crit_temp(tz, temp); guard(thermal_zone)(tz); for_each_trip_desc(tz, td) { const struct thermal_trip *trip = &td->trip; if (trip->type == THERMAL_TRIP_CRITICAL) { *temp = trip->temperature; ret = 0; break; } } return ret; } EXPORT_SYMBOL_GPL(thermal_zone_get_crit_temp); static void thermal_zone_device_check(struct work_struct *work) { struct thermal_zone_device *tz = container_of(work, struct thermal_zone_device, poll_queue.work); thermal_zone_device_update(tz, THERMAL_EVENT_UNSPECIFIED); } static void thermal_zone_device_init(struct thermal_zone_device *tz) { struct thermal_trip_desc *td, *next; INIT_DELAYED_WORK(&tz->poll_queue, thermal_zone_device_check); tz->temperature = THERMAL_TEMP_INIT; tz->passive = 0; tz->prev_low_trip = -INT_MAX; tz->prev_high_trip = INT_MAX; for_each_trip_desc(tz, td) { struct thermal_instance *instance; list_for_each_entry(instance, &td->thermal_instances, trip_node) instance->initialized = false; } /* * At this point, all valid trips need to be moved to trips_high so that * mitigation can be started if the zone temperature is above them. */ list_for_each_entry_safe(td, next, &tz->trips_invalid, list_node) { if (td->trip.temperature != THERMAL_TEMP_INVALID) move_to_trips_high(tz, td); } /* The trips_reached list may not be empty during system resume. 
*/ list_for_each_entry_safe(td, next, &tz->trips_reached, list_node) { if (td->trip.temperature == THERMAL_TEMP_INVALID) move_to_trips_invalid(tz, td); else move_to_trips_high(tz, td); } } static int thermal_zone_init_governor(struct thermal_zone_device *tz) { struct thermal_governor *governor; guard(mutex)(&thermal_governor_lock); if (tz->tzp) governor = __find_governor(tz->tzp->governor_name); else governor = def_governor; return thermal_set_governor(tz, governor); } static void thermal_zone_init_complete(struct thermal_zone_device *tz) { struct thermal_cooling_device *cdev; guard(mutex)(&thermal_list_lock); list_add_tail(&tz->node, &thermal_tz_list); guard(thermal_zone)(tz); /* Bind cooling devices for this zone. */ list_for_each_entry(cdev, &thermal_cdev_list, node) __thermal_zone_cdev_bind(tz, cdev); tz->state &= ~TZ_STATE_FLAG_INIT; /* * If system suspend or resume is in progress at this point, the * new thermal zone needs to be marked as suspended because * thermal_pm_notify() has run already. */ if (thermal_pm_suspended) tz->state |= TZ_STATE_FLAG_SUSPENDED; __thermal_zone_device_update(tz, THERMAL_EVENT_UNSPECIFIED); } /** * thermal_zone_device_register_with_trips() - register a new thermal zone device * @type: the thermal zone device type * @trips: a pointer to an array of thermal trips * @num_trips: the number of trip points the thermal zone supports * @devdata: private device data * @ops: standard thermal zone device callbacks * @tzp: thermal zone platform parameters * @passive_delay: number of milliseconds to wait between polls when * performing passive cooling * @polling_delay: number of milliseconds to wait between polls when checking * whether trip points have been crossed (0 for interrupt * driven systems) * * This interface function adds a new thermal zone device (sensor) to * /sys/class/thermal folder as thermal_zone[0-*]. It tries to bind all the * thermal cooling devices registered at the same time. * thermal_zone_device_unregister() must be called when the device is no * longer needed. The passive cooling depends on the .get_trend() return value. * * Return: a pointer to the created struct thermal_zone_device or, * in case of error, an ERR_PTR. Caller must check return value with * IS_ERR*() helpers. 
*/ struct thermal_zone_device * thermal_zone_device_register_with_trips(const char *type, const struct thermal_trip *trips, int num_trips, void *devdata, const struct thermal_zone_device_ops *ops, const struct thermal_zone_params *tzp, unsigned int passive_delay, unsigned int polling_delay) { const struct thermal_trip *trip = trips; struct thermal_zone_device *tz; struct thermal_trip_desc *td; int id; int result; if (!type || strlen(type) == 0) { pr_err("No thermal zone type defined\n"); return ERR_PTR(-EINVAL); } if (strlen(type) >= THERMAL_NAME_LENGTH) { pr_err("Thermal zone name (%s) too long, should be under %d chars\n", type, THERMAL_NAME_LENGTH); return ERR_PTR(-EINVAL); } if (num_trips < 0) { pr_err("Incorrect number of thermal trips\n"); return ERR_PTR(-EINVAL); } if (!ops || !ops->get_temp) { pr_err("Thermal zone device ops not defined or invalid\n"); return ERR_PTR(-EINVAL); } if (num_trips > 0 && !trips) return ERR_PTR(-EINVAL); if (polling_delay && passive_delay > polling_delay) return ERR_PTR(-EINVAL); if (!thermal_class) return ERR_PTR(-ENODEV); tz = kzalloc(struct_size(tz, trips, num_trips), GFP_KERNEL); if (!tz) return ERR_PTR(-ENOMEM); if (tzp) { tz->tzp = kmemdup(tzp, sizeof(*tzp), GFP_KERNEL); if (!tz->tzp) { result = -ENOMEM; goto free_tz; } } INIT_LIST_HEAD(&tz->node); INIT_LIST_HEAD(&tz->trips_high); INIT_LIST_HEAD(&tz->trips_reached); INIT_LIST_HEAD(&tz->trips_invalid); ida_init(&tz->ida); mutex_init(&tz->lock); init_completion(&tz->removal); init_completion(&tz->resume); id = ida_alloc(&thermal_tz_ida, GFP_KERNEL); if (id < 0) { result = id; goto free_tzp; } tz->id = id; strscpy(tz->type, type, sizeof(tz->type)); tz->ops = *ops; if (!tz->ops.critical) tz->ops.critical = thermal_zone_device_critical; tz->device.class = thermal_class; tz->devdata = devdata; tz->num_trips = num_trips; for_each_trip_desc(tz, td) { td->trip = *trip++; INIT_LIST_HEAD(&td->thermal_instances); INIT_LIST_HEAD(&td->list_node); /* * Mark all thresholds as invalid to start with even though * this only matters for the trips that start as invalid and * become valid later. 
*/ move_to_trips_invalid(tz, td); } tz->polling_delay_jiffies = msecs_to_jiffies(polling_delay); tz->passive_delay_jiffies = msecs_to_jiffies(passive_delay); tz->recheck_delay_jiffies = THERMAL_RECHECK_DELAY; tz->state = TZ_STATE_FLAG_INIT; result = dev_set_name(&tz->device, "thermal_zone%d", tz->id); if (result) goto remove_id; thermal_zone_device_init(tz); result = thermal_zone_init_governor(tz); if (result) goto remove_id; /* sys I/F */ /* Add nodes that are always present via .groups */ result = thermal_zone_create_device_groups(tz); if (result) goto remove_id; result = device_register(&tz->device); if (result) goto release_device; if (!tz->tzp || !tz->tzp->no_hwmon) { result = thermal_add_hwmon_sysfs(tz); if (result) goto unregister; } result = thermal_thresholds_init(tz); if (result) goto remove_hwmon; thermal_zone_init_complete(tz); thermal_notify_tz_create(tz); thermal_debug_tz_add(tz); return tz; remove_hwmon: thermal_remove_hwmon_sysfs(tz); unregister: device_del(&tz->device); release_device: put_device(&tz->device); remove_id: ida_free(&thermal_tz_ida, id); free_tzp: kfree(tz->tzp); free_tz: kfree(tz); return ERR_PTR(result); } EXPORT_SYMBOL_GPL(thermal_zone_device_register_with_trips); struct thermal_zone_device *thermal_tripless_zone_device_register( const char *type, void *devdata, const struct thermal_zone_device_ops *ops, const struct thermal_zone_params *tzp) { return thermal_zone_device_register_with_trips(type, NULL, 0, devdata, ops, tzp, 0, 0); } EXPORT_SYMBOL_GPL(thermal_tripless_zone_device_register); void *thermal_zone_device_priv(struct thermal_zone_device *tzd) { return tzd->devdata; } EXPORT_SYMBOL_GPL(thermal_zone_device_priv); const char *thermal_zone_device_type(struct thermal_zone_device *tzd) { return tzd->type; } EXPORT_SYMBOL_GPL(thermal_zone_device_type); int thermal_zone_device_id(struct thermal_zone_device *tzd) { return tzd->id; } EXPORT_SYMBOL_GPL(thermal_zone_device_id); struct device *thermal_zone_device(struct thermal_zone_device *tzd) { return &tzd->device; } EXPORT_SYMBOL_GPL(thermal_zone_device); static bool thermal_zone_exit(struct thermal_zone_device *tz) { struct thermal_cooling_device *cdev; guard(mutex)(&thermal_list_lock); if (list_empty(&tz->node)) return false; guard(thermal_zone)(tz); tz->state |= TZ_STATE_FLAG_EXIT; list_del_init(&tz->node); /* Unbind all cdevs associated with this thermal zone. */ list_for_each_entry(cdev, &thermal_cdev_list, node) __thermal_zone_cdev_unbind(tz, cdev); return true; } /** * thermal_zone_device_unregister - removes the registered thermal zone device * @tz: the thermal zone device to remove */ void thermal_zone_device_unregister(struct thermal_zone_device *tz) { if (!tz) return; thermal_debug_tz_remove(tz); if (!thermal_zone_exit(tz)) return; cancel_delayed_work_sync(&tz->poll_queue); thermal_set_governor(tz, NULL); thermal_thresholds_exit(tz); thermal_remove_hwmon_sysfs(tz); ida_free(&thermal_tz_ida, tz->id); ida_destroy(&tz->ida); device_del(&tz->device); put_device(&tz->device); thermal_notify_tz_delete(tz); wait_for_completion(&tz->removal); kfree(tz->tzp); kfree(tz); } EXPORT_SYMBOL_GPL(thermal_zone_device_unregister); /** * thermal_zone_get_zone_by_name() - search for a zone and returns its ref * @name: thermal zone name to fetch the temperature * * When only one zone is found with the passed name, returns a reference to it. 
* * Return: On success, a reference to the unique thermal zone whose name * matches @name; an ERR_PTR otherwise (-EINVAL for invalid * parameters, -ENODEV for not found and -EEXIST for multiple matches). */ struct thermal_zone_device *thermal_zone_get_zone_by_name(const char *name) { struct thermal_zone_device *pos = NULL, *ref = ERR_PTR(-EINVAL); unsigned int found = 0; if (!name) return ERR_PTR(-EINVAL); guard(mutex)(&thermal_list_lock); list_for_each_entry(pos, &thermal_tz_list, node) if (!strncasecmp(name, pos->type, THERMAL_NAME_LENGTH)) { found++; ref = pos; } if (!found) return ERR_PTR(-ENODEV); /* Success only when one zone is found. */ if (found > 1) return ERR_PTR(-EEXIST); return ref; } EXPORT_SYMBOL_GPL(thermal_zone_get_zone_by_name); static void thermal_zone_device_resume(struct work_struct *work) { struct thermal_zone_device *tz; tz = container_of(work, struct thermal_zone_device, poll_queue.work); guard(thermal_zone)(tz); tz->state &= ~(TZ_STATE_FLAG_SUSPENDED | TZ_STATE_FLAG_RESUMING); thermal_debug_tz_resume(tz); thermal_zone_device_init(tz); thermal_governor_update_tz(tz, THERMAL_TZ_RESUME); __thermal_zone_device_update(tz, THERMAL_TZ_RESUME); complete(&tz->resume); } static void thermal_zone_pm_prepare(struct thermal_zone_device *tz) { guard(thermal_zone)(tz); if (tz->state & TZ_STATE_FLAG_RESUMING) { /* * thermal_zone_device_resume() queued up for this zone has not * acquired the lock yet, so release it to let the function run * and wait until it has done the work. */ scoped_guard(thermal_zone_reverse, tz) { wait_for_completion(&tz->resume); } } tz->state |= TZ_STATE_FLAG_SUSPENDED; } static void thermal_pm_notify_prepare(void) { struct thermal_zone_device *tz; guard(mutex)(&thermal_list_lock); thermal_pm_suspended = true; list_for_each_entry(tz, &thermal_tz_list, node) thermal_zone_pm_prepare(tz); } static void thermal_zone_pm_complete(struct thermal_zone_device *tz) { guard(thermal_zone)(tz); cancel_delayed_work(&tz->poll_queue); reinit_completion(&tz->resume); tz->state |= TZ_STATE_FLAG_RESUMING; /* * Replace the work function with the resume one, which will restore the * original work function and schedule the polling work if needed. */ INIT_DELAYED_WORK(&tz->poll_queue, thermal_zone_device_resume); /* Queue up the work without a delay. */ mod_delayed_work(system_freezable_power_efficient_wq, &tz->poll_queue, 0); } static void thermal_pm_notify_complete(void) { struct thermal_zone_device *tz; guard(mutex)(&thermal_list_lock); thermal_pm_suspended = false; list_for_each_entry(tz, &thermal_tz_list, node) thermal_zone_pm_complete(tz); } static int thermal_pm_notify(struct notifier_block *nb, unsigned long mode, void *_unused) { switch (mode) { case PM_HIBERNATION_PREPARE: case PM_RESTORE_PREPARE: case PM_SUSPEND_PREPARE: thermal_pm_notify_prepare(); break; case PM_POST_HIBERNATION: case PM_POST_RESTORE: case PM_POST_SUSPEND: thermal_pm_notify_complete(); break; default: break; } return 0; } static struct notifier_block thermal_pm_nb = { .notifier_call = thermal_pm_notify, /* * Run at the lowest priority to avoid interference between the thermal * zone resume work items spawned by thermal_pm_notify() and the other * PM notifiers. 
*/ .priority = INT_MIN, }; static int __init thermal_init(void) { int result; thermal_debug_init(); result = thermal_netlink_init(); if (result) goto error; result = thermal_register_governors(); if (result) goto unregister_netlink; thermal_class = kzalloc(sizeof(*thermal_class), GFP_KERNEL); if (!thermal_class) { result = -ENOMEM; goto unregister_governors; } thermal_class->name = "thermal"; thermal_class->dev_release = thermal_release; result = class_register(thermal_class); if (result) { kfree(thermal_class); thermal_class = NULL; goto unregister_governors; } result = register_pm_notifier(&thermal_pm_nb); if (result) pr_warn("Thermal: Can not register suspend notifier, return %d\n", result); return 0; unregister_governors: thermal_unregister_governors(); unregister_netlink: thermal_netlink_exit(); error: mutex_destroy(&thermal_list_lock); mutex_destroy(&thermal_governor_lock); return result; } postcore_initcall(thermal_init);
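/*
 * Illustrative usage sketch (not part of thermal_core.c): a minimal client of
 * the registration interfaces documented above, shown here only to make the
 * expected call sequence concrete.  All example_* names, the trip
 * temperatures and the fake fan state are hypothetical, and error handling is
 * reduced to the bare minimum.
 */
#include <linux/err.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/thermal.h>

static int example_get_temp(struct thermal_zone_device *tz, int *temp)
{
	/* A real driver would read its sensor; report a fixed 45.0 C here. */
	*temp = 45000;
	return 0;
}

static const struct thermal_zone_device_ops example_tz_ops = {
	.get_temp = example_get_temp,
};

/* One passive trip at 75 C and a critical trip at 95 C (millidegrees). */
static const struct thermal_trip example_trips[] = {
	{ .type = THERMAL_TRIP_PASSIVE,  .temperature = 75000, .hysteresis = 2000 },
	{ .type = THERMAL_TRIP_CRITICAL, .temperature = 95000 },
};

static unsigned long example_fan_state;

static int example_get_max_state(struct thermal_cooling_device *cdev,
				 unsigned long *state)
{
	*state = 3;	/* four fan levels: 0..3 */
	return 0;
}

static int example_get_cur_state(struct thermal_cooling_device *cdev,
				 unsigned long *state)
{
	*state = example_fan_state;
	return 0;
}

static int example_set_cur_state(struct thermal_cooling_device *cdev,
				 unsigned long state)
{
	example_fan_state = state;	/* a real driver would program hardware */
	return 0;
}

static const struct thermal_cooling_device_ops example_cdev_ops = {
	.get_max_state = example_get_max_state,
	.get_cur_state = example_get_cur_state,
	.set_cur_state = example_set_cur_state,
};

static struct thermal_zone_device *example_tz;
static struct thermal_cooling_device *example_cdev;

static int __init example_thermal_client_init(void)
{
	/* Poll every 1000 ms; poll every 250 ms while passive cooling. */
	example_tz = thermal_zone_device_register_with_trips("example_zone",
						example_trips,
						ARRAY_SIZE(example_trips),
						NULL, &example_tz_ops, NULL,
						250, 1000);
	if (IS_ERR(example_tz))
		return PTR_ERR(example_tz);

	example_cdev = thermal_cooling_device_register("example_fan", NULL,
						       &example_cdev_ops);
	if (IS_ERR(example_cdev)) {
		thermal_zone_device_unregister(example_tz);
		return PTR_ERR(example_cdev);
	}

	return thermal_zone_device_enable(example_tz);
}

static void __exit example_thermal_client_exit(void)
{
	thermal_cooling_device_unregister(example_cdev);
	thermal_zone_device_unregister(example_tz);
}

module_init(example_thermal_client_init);
module_exit(example_thermal_client_exit);
MODULE_DESCRIPTION("Illustrative thermal zone/cooling device registration sketch");
MODULE_LICENSE("GPL");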
// SPDX-License-Identifier: GPL-1.0+ /* * originally based on the dummy device. * * Copyright 1999, Thomas Davis, tadavis@lbl.gov. * Based on dummy.c, and eql.c devices. * * bonding.c: an Ethernet Bonding driver * * This is useful to talk to a Cisco EtherChannel compatible equipment: * Cisco 5500 * Sun Trunking (Solaris) * Alteon AceDirector Trunks * Linux Bonding * and probably many L2 switches ... * * How it works: * ifconfig bond0 ipaddress netmask up * will setup a network device, with an ip address. No mac address * will be assigned at this time. The hw mac address will come from * the first slave bonded to the channel. All slaves will then use * this hw mac address. * * ifconfig bond0 down * will release all slaves, marking them as down. * * ifenslave bond0 eth0 * will attach eth0 to bond0 as a slave. eth0 hw mac address will either * a: be used as initial mac address * b: if a hw mac address already is there, eth0's hw mac address * will then be set from bond0. 
* */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/types.h> #include <linux/fcntl.h> #include <linux/filter.h> #include <linux/interrupt.h> #include <linux/ptrace.h> #include <linux/ioport.h> #include <linux/in.h> #include <net/ip.h> #include <linux/ip.h> #include <linux/icmp.h> #include <linux/icmpv6.h> #include <linux/tcp.h> #include <linux/udp.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/init.h> #include <linux/timer.h> #include <linux/socket.h> #include <linux/ctype.h> #include <linux/inet.h> #include <linux/bitops.h> #include <linux/io.h> #include <asm/dma.h> #include <linux/uaccess.h> #include <linux/errno.h> #include <linux/netdevice.h> #include <linux/inetdevice.h> #include <linux/igmp.h> #include <linux/etherdevice.h> #include <linux/skbuff.h> #include <net/sock.h> #include <linux/rtnetlink.h> #include <linux/smp.h> #include <linux/if_ether.h> #include <net/arp.h> #include <linux/mii.h> #include <linux/ethtool.h> #include <linux/if_vlan.h> #include <linux/if_bonding.h> #include <linux/phy.h> #include <linux/jiffies.h> #include <linux/preempt.h> #include <net/route.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/pkt_sched.h> #include <linux/rculist.h> #include <net/flow_dissector.h> #include <net/xfrm.h> #include <net/bonding.h> #include <net/bond_3ad.h> #include <net/bond_alb.h> #if IS_ENABLED(CONFIG_TLS_DEVICE) #include <net/tls.h> #endif #include <net/ip6_route.h> #include <net/netdev_lock.h> #include <net/xdp.h> #include "bonding_priv.h" /*---------------------------- Module parameters ----------------------------*/ /* monitor all links that often (in milliseconds). <=0 disables monitoring */ static int max_bonds = BOND_DEFAULT_MAX_BONDS; static int tx_queues = BOND_DEFAULT_TX_QUEUES; static int num_peer_notif = 1; static int miimon; static int updelay; static int downdelay; static int use_carrier = 1; static char *mode; static char *primary; static char *primary_reselect; static char *lacp_rate; static int min_links; static char *ad_select; static char *xmit_hash_policy; static int arp_interval; static char *arp_ip_target[BOND_MAX_ARP_TARGETS]; static char *arp_validate; static char *arp_all_targets; static char *fail_over_mac; static int all_slaves_active; static struct bond_params bonding_defaults; static int resend_igmp = BOND_DEFAULT_RESEND_IGMP; static int packets_per_slave = 1; static int lp_interval = BOND_ALB_DEFAULT_LP_INTERVAL; module_param(max_bonds, int, 0); MODULE_PARM_DESC(max_bonds, "Max number of bonded devices"); module_param(tx_queues, int, 0); MODULE_PARM_DESC(tx_queues, "Max number of transmit queues (default = 16)"); module_param_named(num_grat_arp, num_peer_notif, int, 0644); MODULE_PARM_DESC(num_grat_arp, "Number of peer notifications to send on " "failover event (alias of num_unsol_na)"); module_param_named(num_unsol_na, num_peer_notif, int, 0644); MODULE_PARM_DESC(num_unsol_na, "Number of peer notifications to send on " "failover event (alias of num_grat_arp)"); module_param(miimon, int, 0); MODULE_PARM_DESC(miimon, "Link check interval in milliseconds"); module_param(updelay, int, 0); MODULE_PARM_DESC(updelay, "Delay before considering link up, in milliseconds"); module_param(downdelay, int, 0); MODULE_PARM_DESC(downdelay, "Delay before considering link down, " "in milliseconds"); module_param(use_carrier, int, 0); MODULE_PARM_DESC(use_carrier, "option obsolete, use_carrier cannot be disabled"); module_param(mode, charp, 0); MODULE_PARM_DESC(mode, "Mode of operation; 0 for 
balance-rr, " "1 for active-backup, 2 for balance-xor, " "3 for broadcast, 4 for 802.3ad, 5 for balance-tlb, " "6 for balance-alb"); module_param(primary, charp, 0); MODULE_PARM_DESC(primary, "Primary network device to use"); module_param(primary_reselect, charp, 0); MODULE_PARM_DESC(primary_reselect, "Reselect primary slave " "once it comes up; " "0 for always (default), " "1 for only if speed of primary is " "better, " "2 for only on active slave " "failure"); module_param(lacp_rate, charp, 0); MODULE_PARM_DESC(lacp_rate, "LACPDU tx rate to request from 802.3ad partner; " "0 for slow, 1 for fast"); module_param(ad_select, charp, 0); MODULE_PARM_DESC(ad_select, "802.3ad aggregation selection logic; " "0 for stable (default), 1 for bandwidth, " "2 for count"); module_param(min_links, int, 0); MODULE_PARM_DESC(min_links, "Minimum number of available links before turning on carrier"); module_param(xmit_hash_policy, charp, 0); MODULE_PARM_DESC(xmit_hash_policy, "balance-alb, balance-tlb, balance-xor, 802.3ad hashing method; " "0 for layer 2 (default), 1 for layer 3+4, " "2 for layer 2+3, 3 for encap layer 2+3, " "4 for encap layer 3+4, 5 for vlan+srcmac"); module_param(arp_interval, int, 0); MODULE_PARM_DESC(arp_interval, "arp interval in milliseconds"); module_param_array(arp_ip_target, charp, NULL, 0); MODULE_PARM_DESC(arp_ip_target, "arp targets in n.n.n.n form"); module_param(arp_validate, charp, 0); MODULE_PARM_DESC(arp_validate, "validate src/dst of ARP probes; " "0 for none (default), 1 for active, " "2 for backup, 3 for all"); module_param(arp_all_targets, charp, 0); MODULE_PARM_DESC(arp_all_targets, "fail on any/all arp targets timeout; 0 for any (default), 1 for all"); module_param(fail_over_mac, charp, 0); MODULE_PARM_DESC(fail_over_mac, "For active-backup, do not set all slaves to " "the same MAC; 0 for none (default), " "1 for active, 2 for follow"); module_param(all_slaves_active, int, 0); MODULE_PARM_DESC(all_slaves_active, "Keep all frames received on an interface " "by setting active flag for all slaves; " "0 for never (default), 1 for always."); module_param(resend_igmp, int, 0); MODULE_PARM_DESC(resend_igmp, "Number of IGMP membership reports to send on " "link failure"); module_param(packets_per_slave, int, 0); MODULE_PARM_DESC(packets_per_slave, "Packets to send per slave in balance-rr " "mode; 0 for a random slave, 1 packet per " "slave (default), >1 packets per slave."); module_param(lp_interval, uint, 0); MODULE_PARM_DESC(lp_interval, "The number of seconds between instances where " "the bonding driver sends learning packets to " "each slaves peer switch. 
The default is 1."); /*----------------------------- Global variables ----------------------------*/ #ifdef CONFIG_NET_POLL_CONTROLLER atomic_t netpoll_block_tx = ATOMIC_INIT(0); #endif unsigned int bond_net_id __read_mostly; DEFINE_STATIC_KEY_FALSE(bond_bcast_neigh_enabled); static const struct flow_dissector_key flow_keys_bonding_keys[] = { { .key_id = FLOW_DISSECTOR_KEY_CONTROL, .offset = offsetof(struct flow_keys, control), }, { .key_id = FLOW_DISSECTOR_KEY_BASIC, .offset = offsetof(struct flow_keys, basic), }, { .key_id = FLOW_DISSECTOR_KEY_IPV4_ADDRS, .offset = offsetof(struct flow_keys, addrs.v4addrs), }, { .key_id = FLOW_DISSECTOR_KEY_IPV6_ADDRS, .offset = offsetof(struct flow_keys, addrs.v6addrs), }, { .key_id = FLOW_DISSECTOR_KEY_TIPC, .offset = offsetof(struct flow_keys, addrs.tipckey), }, { .key_id = FLOW_DISSECTOR_KEY_PORTS, .offset = offsetof(struct flow_keys, ports), }, { .key_id = FLOW_DISSECTOR_KEY_ICMP, .offset = offsetof(struct flow_keys, icmp), }, { .key_id = FLOW_DISSECTOR_KEY_VLAN, .offset = offsetof(struct flow_keys, vlan), }, { .key_id = FLOW_DISSECTOR_KEY_FLOW_LABEL, .offset = offsetof(struct flow_keys, tags), }, { .key_id = FLOW_DISSECTOR_KEY_GRE_KEYID, .offset = offsetof(struct flow_keys, keyid), }, }; static struct flow_dissector flow_keys_bonding __read_mostly; /*-------------------------- Forward declarations ---------------------------*/ static int bond_init(struct net_device *bond_dev); static void bond_uninit(struct net_device *bond_dev); static void bond_get_stats(struct net_device *bond_dev, struct rtnl_link_stats64 *stats); static void bond_slave_arr_handler(struct work_struct *work); static bool bond_time_in_interval(struct bonding *bond, unsigned long last_act, int mod); static void bond_netdev_notify_work(struct work_struct *work); /*---------------------------- General routines -----------------------------*/ const char *bond_mode_name(int mode) { static const char *names[] = { [BOND_MODE_ROUNDROBIN] = "load balancing (round-robin)", [BOND_MODE_ACTIVEBACKUP] = "fault-tolerance (active-backup)", [BOND_MODE_XOR] = "load balancing (xor)", [BOND_MODE_BROADCAST] = "fault-tolerance (broadcast)", [BOND_MODE_8023AD] = "IEEE 802.3ad Dynamic link aggregation", [BOND_MODE_TLB] = "transmit load balancing", [BOND_MODE_ALB] = "adaptive load balancing", }; if (mode < BOND_MODE_ROUNDROBIN || mode > BOND_MODE_ALB) return "unknown"; return names[mode]; } /** * bond_dev_queue_xmit - Prepare skb for xmit. * * @bond: bond device that got this skb for tx. 
* @skb: hw accel VLAN tagged skb to transmit * @slave_dev: slave that is supposed to xmit this skbuff */ netdev_tx_t bond_dev_queue_xmit(struct bonding *bond, struct sk_buff *skb, struct net_device *slave_dev) { skb->dev = slave_dev; BUILD_BUG_ON(sizeof(skb->queue_mapping) != sizeof(qdisc_skb_cb(skb)->slave_dev_queue_mapping)); skb_set_queue_mapping(skb, qdisc_skb_cb(skb)->slave_dev_queue_mapping); if (unlikely(netpoll_tx_running(bond->dev))) return bond_netpoll_send_skb(bond_get_slave_by_dev(bond, slave_dev), skb); return dev_queue_xmit(skb); } static bool bond_sk_check(struct bonding *bond) { switch (BOND_MODE(bond)) { case BOND_MODE_8023AD: case BOND_MODE_XOR: if (bond->params.xmit_policy == BOND_XMIT_POLICY_LAYER34) return true; fallthrough; default: return false; } } bool bond_xdp_check(struct bonding *bond, int mode) { switch (mode) { case BOND_MODE_ROUNDROBIN: case BOND_MODE_ACTIVEBACKUP: return true; case BOND_MODE_8023AD: case BOND_MODE_XOR: /* vlan+srcmac is not supported with XDP as in most cases the 802.1q * payload is not in the packet due to hardware offload. */ if (bond->params.xmit_policy != BOND_XMIT_POLICY_VLAN_SRCMAC) return true; fallthrough; default: return false; } } /*---------------------------------- VLAN -----------------------------------*/ /* In the following 2 functions, bond_vlan_rx_add_vid and bond_vlan_rx_kill_vid, * We don't protect the slave list iteration with a lock because: * a. This operation is performed in IOCTL context, * b. The operation is protected by the RTNL semaphore in the 8021q code, * c. Holding a lock with BH disabled while directly calling a base driver * entry point is generally a BAD idea. * * The design of synchronization/protection for this operation in the 8021q * module is good for one or more VLAN devices over a single physical device * and cannot be extended for a teaming solution like bonding, so there is a * potential race condition here where a net device from the vlan group might * be referenced (either by a base driver or the 8021q code) while it is being * removed from the system. However, it turns out we're not making matters * worse, and if it works for regular VLAN usage it will work here too. 
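 *
 * As an illustration (not part of the original comment): creating a VLAN
 * on top of the bond, e.g. "ip link add link bond0 name bond0.100 type
 * vlan id 100", reaches the bond via vlan_vid_add(), i.e. via
 * ndo_vlan_rx_add_vid() == bond_vlan_rx_add_vid() below; the bond then
 * fans the VID out with vlan_vid_add(slave->dev, proto, vid) on every
 * slave and, if any slave rejects the filter, rolls the already-updated
 * slaves back with vlan_vid_del() before returning the error.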
*/ /** * bond_vlan_rx_add_vid - Propagates adding an id to slaves * @bond_dev: bonding net device that got called * @proto: network protocol ID * @vid: vlan id being added */ static int bond_vlan_rx_add_vid(struct net_device *bond_dev, __be16 proto, u16 vid) { struct bonding *bond = netdev_priv(bond_dev); struct slave *slave, *rollback_slave; struct list_head *iter; int res; bond_for_each_slave(bond, slave, iter) { res = vlan_vid_add(slave->dev, proto, vid); if (res) goto unwind; } return 0; unwind: /* unwind to the slave that failed */ bond_for_each_slave(bond, rollback_slave, iter) { if (rollback_slave == slave) break; vlan_vid_del(rollback_slave->dev, proto, vid); } return res; } /** * bond_vlan_rx_kill_vid - Propagates deleting an id to slaves * @bond_dev: bonding net device that got called * @proto: network protocol ID * @vid: vlan id being removed */ static int bond_vlan_rx_kill_vid(struct net_device *bond_dev, __be16 proto, u16 vid) { struct bonding *bond = netdev_priv(bond_dev); struct list_head *iter; struct slave *slave; bond_for_each_slave(bond, slave, iter) vlan_vid_del(slave->dev, proto, vid); if (bond_is_lb(bond)) bond_alb_clear_vlan(bond, vid); return 0; } /*---------------------------------- XFRM -----------------------------------*/ #ifdef CONFIG_XFRM_OFFLOAD /** * bond_ipsec_dev - Get active device for IPsec offload * @xs: pointer to transformer state struct * * Context: caller must hold rcu_read_lock. * * Return: the device for ipsec offload, or NULL if not exist. **/ static struct net_device *bond_ipsec_dev(struct xfrm_state *xs) { struct net_device *bond_dev = xs->xso.dev; struct bonding *bond; struct slave *slave; bond = netdev_priv(bond_dev); if (BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP) return NULL; slave = rcu_dereference(bond->curr_active_slave); if (!slave) return NULL; if (!xs->xso.real_dev) return NULL; if (xs->xso.real_dev != slave->dev) pr_warn_ratelimited("%s: (slave %s): not same with IPsec offload real dev %s\n", bond_dev->name, slave->dev->name, xs->xso.real_dev->name); return slave->dev; } /** * bond_ipsec_add_sa - program device with a security association * @bond_dev: pointer to the bond net device * @xs: pointer to transformer state struct * @extack: extack point to fill failure reason **/ static int bond_ipsec_add_sa(struct net_device *bond_dev, struct xfrm_state *xs, struct netlink_ext_ack *extack) { struct net_device *real_dev; netdevice_tracker tracker; struct bond_ipsec *ipsec; struct bonding *bond; struct slave *slave; int err; if (!bond_dev) return -EINVAL; rcu_read_lock(); bond = netdev_priv(bond_dev); slave = rcu_dereference(bond->curr_active_slave); real_dev = slave ? 
slave->dev : NULL; netdev_hold(real_dev, &tracker, GFP_ATOMIC); rcu_read_unlock(); if (!real_dev) { err = -ENODEV; goto out; } if (!real_dev->xfrmdev_ops || !real_dev->xfrmdev_ops->xdo_dev_state_add || netif_is_bond_master(real_dev)) { NL_SET_ERR_MSG_MOD(extack, "Slave does not support ipsec offload"); err = -EINVAL; goto out; } ipsec = kmalloc(sizeof(*ipsec), GFP_KERNEL); if (!ipsec) { err = -ENOMEM; goto out; } err = real_dev->xfrmdev_ops->xdo_dev_state_add(real_dev, xs, extack); if (!err) { xs->xso.real_dev = real_dev; ipsec->xs = xs; INIT_LIST_HEAD(&ipsec->list); mutex_lock(&bond->ipsec_lock); list_add(&ipsec->list, &bond->ipsec_list); mutex_unlock(&bond->ipsec_lock); } else { kfree(ipsec); } out: netdev_put(real_dev, &tracker); return err; } static void bond_ipsec_add_sa_all(struct bonding *bond) { struct net_device *bond_dev = bond->dev; struct net_device *real_dev; struct bond_ipsec *ipsec; struct slave *slave; slave = rtnl_dereference(bond->curr_active_slave); real_dev = slave ? slave->dev : NULL; if (!real_dev) return; mutex_lock(&bond->ipsec_lock); if (!real_dev->xfrmdev_ops || !real_dev->xfrmdev_ops->xdo_dev_state_add || netif_is_bond_master(real_dev)) { if (!list_empty(&bond->ipsec_list)) slave_warn(bond_dev, real_dev, "%s: no slave xdo_dev_state_add\n", __func__); goto out; } list_for_each_entry(ipsec, &bond->ipsec_list, list) { /* If new state is added before ipsec_lock acquired */ if (ipsec->xs->xso.real_dev == real_dev) continue; if (real_dev->xfrmdev_ops->xdo_dev_state_add(real_dev, ipsec->xs, NULL)) { slave_warn(bond_dev, real_dev, "%s: failed to add SA\n", __func__); continue; } spin_lock_bh(&ipsec->xs->lock); /* xs might have been killed by the user during the migration * to the new dev, but bond_ipsec_del_sa() should have done * nothing, as xso.real_dev is NULL. * Delete it from the device we just added it to. The pending * bond_ipsec_free_sa() call will do the rest of the cleanup. */ if (ipsec->xs->km.state == XFRM_STATE_DEAD && real_dev->xfrmdev_ops->xdo_dev_state_delete) real_dev->xfrmdev_ops->xdo_dev_state_delete(real_dev, ipsec->xs); ipsec->xs->xso.real_dev = real_dev; spin_unlock_bh(&ipsec->xs->lock); } out: mutex_unlock(&bond->ipsec_lock); } /** * bond_ipsec_del_sa - clear out this specific SA * @bond_dev: pointer to the bond net device * @xs: pointer to transformer state struct **/ static void bond_ipsec_del_sa(struct net_device *bond_dev, struct xfrm_state *xs) { struct net_device *real_dev; if (!bond_dev || !xs->xso.real_dev) return; real_dev = xs->xso.real_dev; if (!real_dev->xfrmdev_ops || !real_dev->xfrmdev_ops->xdo_dev_state_delete || netif_is_bond_master(real_dev)) { slave_warn(bond_dev, real_dev, "%s: no slave xdo_dev_state_delete\n", __func__); return; } real_dev->xfrmdev_ops->xdo_dev_state_delete(real_dev, xs); } static void bond_ipsec_del_sa_all(struct bonding *bond) { struct net_device *bond_dev = bond->dev; struct net_device *real_dev; struct bond_ipsec *ipsec; struct slave *slave; slave = rtnl_dereference(bond->curr_active_slave); real_dev = slave ? slave->dev : NULL; if (!real_dev) return; mutex_lock(&bond->ipsec_lock); list_for_each_entry(ipsec, &bond->ipsec_list, list) { if (!ipsec->xs->xso.real_dev) continue; if (!real_dev->xfrmdev_ops || !real_dev->xfrmdev_ops->xdo_dev_state_delete || netif_is_bond_master(real_dev)) { slave_warn(bond_dev, real_dev, "%s: no slave xdo_dev_state_delete\n", __func__); continue; } spin_lock_bh(&ipsec->xs->lock); ipsec->xs->xso.real_dev = NULL; /* Don't double delete states killed by the user. 
*/ if (ipsec->xs->km.state != XFRM_STATE_DEAD) real_dev->xfrmdev_ops->xdo_dev_state_delete(real_dev, ipsec->xs); spin_unlock_bh(&ipsec->xs->lock); if (real_dev->xfrmdev_ops->xdo_dev_state_free) real_dev->xfrmdev_ops->xdo_dev_state_free(real_dev, ipsec->xs); } mutex_unlock(&bond->ipsec_lock); } static void bond_ipsec_free_sa(struct net_device *bond_dev, struct xfrm_state *xs) { struct net_device *real_dev; struct bond_ipsec *ipsec; struct bonding *bond; if (!bond_dev) return; bond = netdev_priv(bond_dev); mutex_lock(&bond->ipsec_lock); if (!xs->xso.real_dev) goto out; real_dev = xs->xso.real_dev; xs->xso.real_dev = NULL; if (real_dev->xfrmdev_ops && real_dev->xfrmdev_ops->xdo_dev_state_free) real_dev->xfrmdev_ops->xdo_dev_state_free(real_dev, xs); out: list_for_each_entry(ipsec, &bond->ipsec_list, list) { if (ipsec->xs == xs) { list_del(&ipsec->list); kfree(ipsec); break; } } mutex_unlock(&bond->ipsec_lock); } /** * bond_ipsec_offload_ok - can this packet use the xfrm hw offload * @skb: current data packet * @xs: pointer to transformer state struct **/ static bool bond_ipsec_offload_ok(struct sk_buff *skb, struct xfrm_state *xs) { struct net_device *real_dev; rcu_read_lock(); real_dev = bond_ipsec_dev(xs); if (!real_dev || netif_is_bond_master(real_dev)) { rcu_read_unlock(); return false; } rcu_read_unlock(); return true; } /** * bond_advance_esn_state - ESN support for IPSec HW offload * @xs: pointer to transformer state struct **/ static void bond_advance_esn_state(struct xfrm_state *xs) { struct net_device *real_dev; rcu_read_lock(); real_dev = bond_ipsec_dev(xs); if (!real_dev) goto out; if (!real_dev->xfrmdev_ops || !real_dev->xfrmdev_ops->xdo_dev_state_advance_esn) { pr_warn_ratelimited("%s: %s doesn't support xdo_dev_state_advance_esn\n", __func__, real_dev->name); goto out; } real_dev->xfrmdev_ops->xdo_dev_state_advance_esn(xs); out: rcu_read_unlock(); } /** * bond_xfrm_update_stats - Update xfrm state * @xs: pointer to transformer state struct **/ static void bond_xfrm_update_stats(struct xfrm_state *xs) { struct net_device *real_dev; rcu_read_lock(); real_dev = bond_ipsec_dev(xs); if (!real_dev) goto out; if (!real_dev->xfrmdev_ops || !real_dev->xfrmdev_ops->xdo_dev_state_update_stats) { pr_warn_ratelimited("%s: %s doesn't support xdo_dev_state_update_stats\n", __func__, real_dev->name); goto out; } real_dev->xfrmdev_ops->xdo_dev_state_update_stats(xs); out: rcu_read_unlock(); } static const struct xfrmdev_ops bond_xfrmdev_ops = { .xdo_dev_state_add = bond_ipsec_add_sa, .xdo_dev_state_delete = bond_ipsec_del_sa, .xdo_dev_state_free = bond_ipsec_free_sa, .xdo_dev_offload_ok = bond_ipsec_offload_ok, .xdo_dev_state_advance_esn = bond_advance_esn_state, .xdo_dev_state_update_stats = bond_xfrm_update_stats, }; #endif /* CONFIG_XFRM_OFFLOAD */ /*------------------------------- Link status -------------------------------*/ /* Set the carrier state for the master according to the state of its * slaves. If any slaves are up, the master is up. In 802.3ad mode, * do special 802.3ad magic. * * Returns zero if carrier state does not change, nonzero if it does. 
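 *
 * A minimal usage sketch (mirroring bond_select_active_slave() further
 * below, nothing beyond what this file already does):
 *
 *	if (bond_set_carrier(bond)) {
 *		if (netif_carrier_ok(bond->dev))
 *			netdev_info(bond->dev, "active interface up!\n");
 *		else
 *			netdev_info(bond->dev, "now running without any active interface!\n");
 *	}
 *
 * i.e. callers only need to react when the carrier state actually changed.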
*/ int bond_set_carrier(struct bonding *bond) { struct list_head *iter; struct slave *slave; if (!bond_has_slaves(bond)) goto down; if (BOND_MODE(bond) == BOND_MODE_8023AD) return bond_3ad_set_carrier(bond); bond_for_each_slave(bond, slave, iter) { if (slave->link == BOND_LINK_UP) { if (!netif_carrier_ok(bond->dev)) { netif_carrier_on(bond->dev); return 1; } return 0; } } down: if (netif_carrier_ok(bond->dev)) { netif_carrier_off(bond->dev); return 1; } return 0; } /* Get link speed and duplex from the slave's base driver * using ethtool. If for some reason the call fails or the * values are invalid, set speed and duplex to -1, * and return. Return 1 if speed or duplex settings are * UNKNOWN; 0 otherwise. */ static int bond_update_speed_duplex(struct slave *slave) { struct net_device *slave_dev = slave->dev; struct ethtool_link_ksettings ecmd; int res; slave->speed = SPEED_UNKNOWN; slave->duplex = DUPLEX_UNKNOWN; res = __ethtool_get_link_ksettings(slave_dev, &ecmd); if (res < 0) return 1; if (ecmd.base.speed == 0 || ecmd.base.speed == ((__u32)-1)) return 1; switch (ecmd.base.duplex) { case DUPLEX_FULL: case DUPLEX_HALF: break; default: return 1; } slave->speed = ecmd.base.speed; slave->duplex = ecmd.base.duplex; return 0; } const char *bond_slave_link_status(s8 link) { switch (link) { case BOND_LINK_UP: return "up"; case BOND_LINK_FAIL: return "going down"; case BOND_LINK_DOWN: return "down"; case BOND_LINK_BACK: return "going back"; default: return "unknown"; } } /*----------------------------- Multicast list ------------------------------*/ /* Push the promiscuity flag down to appropriate slaves */ static int bond_set_promiscuity(struct bonding *bond, int inc) { struct list_head *iter; int err = 0; if (bond_uses_primary(bond)) { struct slave *curr_active = rtnl_dereference(bond->curr_active_slave); if (curr_active) err = dev_set_promiscuity(curr_active->dev, inc); } else { struct slave *slave; bond_for_each_slave(bond, slave, iter) { err = dev_set_promiscuity(slave->dev, inc); if (err) return err; } } return err; } /* Push the allmulti flag down to all slaves */ static int bond_set_allmulti(struct bonding *bond, int inc) { struct list_head *iter; int err = 0; if (bond_uses_primary(bond)) { struct slave *curr_active = rtnl_dereference(bond->curr_active_slave); if (curr_active) err = dev_set_allmulti(curr_active->dev, inc); } else { struct slave *slave; bond_for_each_slave(bond, slave, iter) { err = dev_set_allmulti(slave->dev, inc); if (err) return err; } } return err; } /* Retrieve the list of registered multicast addresses for the bonding * device and retransmit an IGMP JOIN request to the current active * slave. 
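 *
 * Illustrative arming of this work item, as done on failover in
 * bond_change_active_slave() below:
 *
 *	bond->igmp_retrans = bond->params.resend_igmp;
 *	queue_delayed_work(bond->wq, &bond->mcast_work, 1);
 *
 * The handler re-queues itself one jiffy later whenever rtnl_trylock()
 * fails, and re-arms itself every HZ/5 while igmp_retrans is above 1, so
 * roughly resend_igmp membership reports go out in total.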
 */
static void bond_resend_igmp_join_requests_delayed(struct work_struct *work)
{
	struct bonding *bond = container_of(work, struct bonding,
					    mcast_work.work);

	if (!rtnl_trylock()) {
		queue_delayed_work(bond->wq, &bond->mcast_work, 1);
		return;
	}
	call_netdevice_notifiers(NETDEV_RESEND_IGMP, bond->dev);

	if (bond->igmp_retrans > 1) {
		bond->igmp_retrans--;
		queue_delayed_work(bond->wq, &bond->mcast_work, HZ/5);
	}
	rtnl_unlock();
}

/* Flush bond's hardware addresses from slave */
static void bond_hw_addr_flush(struct net_device *bond_dev,
			       struct net_device *slave_dev)
{
	struct bonding *bond = netdev_priv(bond_dev);

	dev_uc_unsync(slave_dev, bond_dev);
	dev_mc_unsync(slave_dev, bond_dev);

	if (BOND_MODE(bond) == BOND_MODE_8023AD)
		dev_mc_del(slave_dev, lacpdu_mcast_addr);
}

/*--------------------------- Active slave change ---------------------------*/

/* Update the hardware address list and promisc/allmulti for the new and
 * old active slaves (if any). Modes that are not using primary keep all
 * slaves up to date at all times; only the modes that use primary need to
 * call this function to swap these settings during a failover.
 */
static void bond_hw_addr_swap(struct bonding *bond, struct slave *new_active,
			      struct slave *old_active)
{
	if (old_active) {
		if (bond->dev->flags & IFF_PROMISC)
			dev_set_promiscuity(old_active->dev, -1);

		if (bond->dev->flags & IFF_ALLMULTI)
			dev_set_allmulti(old_active->dev, -1);

		if (bond->dev->flags & IFF_UP)
			bond_hw_addr_flush(bond->dev, old_active->dev);

		bond_slave_ns_maddrs_add(bond, old_active);
	}

	if (new_active) {
		/* FIXME: Signal errors upstream. */
		if (bond->dev->flags & IFF_PROMISC)
			dev_set_promiscuity(new_active->dev, 1);

		if (bond->dev->flags & IFF_ALLMULTI)
			dev_set_allmulti(new_active->dev, 1);

		if (bond->dev->flags & IFF_UP) {
			netif_addr_lock_bh(bond->dev);
			dev_uc_sync(new_active->dev, bond->dev);
			dev_mc_sync(new_active->dev, bond->dev);
			netif_addr_unlock_bh(bond->dev);
		}

		bond_slave_ns_maddrs_del(bond, new_active);
	}
}

/**
 * bond_set_dev_addr - clone slave's address to bond
 * @bond_dev: bond net device
 * @slave_dev: slave net device
 *
 * Should be called with RTNL held.
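 *
 * Illustrative call site (see bond_enslave() below): the address is only
 * cloned from the first slave while the bond still has a random address,
 *
 *	if (!bond_has_slaves(bond) &&
 *	    bond->dev->addr_assign_type == NET_ADDR_RANDOM)
 *		res = bond_set_dev_addr(bond->dev, slave_dev);
 *
 * after which addr_assign_type reads NET_ADDR_STOLEN, i.e. the address
 * was taken from another device.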
*/ static int bond_set_dev_addr(struct net_device *bond_dev, struct net_device *slave_dev) { int err; slave_dbg(bond_dev, slave_dev, "bond_dev=%p slave_dev=%p slave_dev->addr_len=%d\n", bond_dev, slave_dev, slave_dev->addr_len); err = netif_pre_changeaddr_notify(bond_dev, slave_dev->dev_addr, NULL); if (err) return err; __dev_addr_set(bond_dev, slave_dev->dev_addr, slave_dev->addr_len); bond_dev->addr_assign_type = NET_ADDR_STOLEN; call_netdevice_notifiers(NETDEV_CHANGEADDR, bond_dev); return 0; } static struct slave *bond_get_old_active(struct bonding *bond, struct slave *new_active) { struct slave *slave; struct list_head *iter; bond_for_each_slave(bond, slave, iter) { if (slave == new_active) continue; if (ether_addr_equal(bond->dev->dev_addr, slave->dev->dev_addr)) return slave; } return NULL; } /* bond_do_fail_over_mac * * Perform special MAC address swapping for fail_over_mac settings * * Called with RTNL */ static void bond_do_fail_over_mac(struct bonding *bond, struct slave *new_active, struct slave *old_active) { u8 tmp_mac[MAX_ADDR_LEN]; struct sockaddr_storage ss; int rv; switch (bond->params.fail_over_mac) { case BOND_FOM_ACTIVE: if (new_active) { rv = bond_set_dev_addr(bond->dev, new_active->dev); if (rv) slave_err(bond->dev, new_active->dev, "Error %d setting bond MAC from slave\n", -rv); } break; case BOND_FOM_FOLLOW: /* if new_active && old_active, swap them * if just old_active, do nothing (going to no active slave) * if just new_active, set new_active to bond's MAC */ if (!new_active) return; if (!old_active) old_active = bond_get_old_active(bond, new_active); if (old_active) { bond_hw_addr_copy(tmp_mac, new_active->dev->dev_addr, new_active->dev->addr_len); bond_hw_addr_copy(ss.__data, old_active->dev->dev_addr, old_active->dev->addr_len); ss.ss_family = new_active->dev->type; } else { bond_hw_addr_copy(ss.__data, bond->dev->dev_addr, bond->dev->addr_len); ss.ss_family = bond->dev->type; } rv = dev_set_mac_address(new_active->dev, &ss, NULL); if (rv) { slave_err(bond->dev, new_active->dev, "Error %d setting MAC of new active slave\n", -rv); goto out; } if (!old_active) goto out; bond_hw_addr_copy(ss.__data, tmp_mac, new_active->dev->addr_len); ss.ss_family = old_active->dev->type; rv = dev_set_mac_address(old_active->dev, &ss, NULL); if (rv) slave_err(bond->dev, old_active->dev, "Error %d setting MAC of old active slave\n", -rv); out: break; default: netdev_err(bond->dev, "bond_do_fail_over_mac impossible: bad policy %d\n", bond->params.fail_over_mac); break; } } /** * bond_choose_primary_or_current - select the primary or high priority slave * @bond: our bonding struct * * - Check if there is a primary link. If the primary link was set and is up, * go on and do link reselection. * * - If primary link is not set or down, find the highest priority link. * If the highest priority link is not current slave, set it as primary * link and do link reselection. 
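 *
 * Worked example (illustrative) of the primary_reselect handling in the
 * switch below: with primary_reselect=better, a 1 Gb/s primary that comes
 * back while a 10 Gb/s slave is active is not reselected
 * (prim->speed < curr->speed keeps curr); with primary_reselect=always it
 * would be; with primary_reselect=failure the current slave is kept until
 * it actually fails.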
*/ static struct slave *bond_choose_primary_or_current(struct bonding *bond) { struct slave *prim = rtnl_dereference(bond->primary_slave); struct slave *curr = rtnl_dereference(bond->curr_active_slave); struct slave *slave, *hprio = NULL; struct list_head *iter; if (!prim || prim->link != BOND_LINK_UP) { bond_for_each_slave(bond, slave, iter) { if (slave->link == BOND_LINK_UP) { hprio = hprio ?: slave; if (slave->prio > hprio->prio) hprio = slave; } } if (hprio && hprio != curr) { prim = hprio; goto link_reselect; } if (!curr || curr->link != BOND_LINK_UP) return NULL; return curr; } if (bond->force_primary) { bond->force_primary = false; return prim; } link_reselect: if (!curr || curr->link != BOND_LINK_UP) return prim; /* At this point, prim and curr are both up */ switch (bond->params.primary_reselect) { case BOND_PRI_RESELECT_ALWAYS: return prim; case BOND_PRI_RESELECT_BETTER: if (prim->speed < curr->speed) return curr; if (prim->speed == curr->speed && prim->duplex <= curr->duplex) return curr; return prim; case BOND_PRI_RESELECT_FAILURE: return curr; default: netdev_err(bond->dev, "impossible primary_reselect %d\n", bond->params.primary_reselect); return curr; } } /** * bond_find_best_slave - select the best available slave to be the active one * @bond: our bonding struct */ static struct slave *bond_find_best_slave(struct bonding *bond) { struct slave *slave, *bestslave = NULL; struct list_head *iter; int mintime = bond->params.updelay; slave = bond_choose_primary_or_current(bond); if (slave) return slave; bond_for_each_slave(bond, slave, iter) { if (slave->link == BOND_LINK_UP) return slave; if (slave->link == BOND_LINK_BACK && bond_slave_is_up(slave) && slave->delay < mintime) { mintime = slave->delay; bestslave = slave; } } return bestslave; } /* must be called in RCU critical section or with RTNL held */ static bool bond_should_notify_peers(struct bonding *bond) { struct bond_up_slave *usable; struct slave *slave = NULL; if (!bond->send_peer_notif || bond->send_peer_notif % max(1, bond->params.peer_notif_delay) != 0 || !netif_carrier_ok(bond->dev)) return false; /* The send_peer_notif is set by active-backup or 8023ad * mode, and cleared in bond_close() when changing mode. * It is safe to only check bond mode here. */ if (BOND_MODE(bond) == BOND_MODE_8023AD) { usable = rcu_dereference_rtnl(bond->usable_slaves); if (!usable || !READ_ONCE(usable->count)) return false; } else { slave = rcu_dereference_rtnl(bond->curr_active_slave); if (!slave || test_bit(__LINK_STATE_LINKWATCH_PENDING, &slave->dev->state)) return false; } netdev_dbg(bond->dev, "bond_should_notify_peers: slave %s\n", slave ? slave->dev->name : "all"); return true; } /** * bond_change_active_slave - change the active slave into the specified one * @bond: our bonding struct * @new_active: the new slave to make the active one * * Set the new slave to the bond's settings and unset them on the old * curr_active_slave. * Setting include flags, mc-list, promiscuity, allmulti, etc. * * If @new's link state is %BOND_LINK_BACK we'll set it to %BOND_LINK_UP, * because it is apparently the best available slave we have, even though its * updelay hasn't timed out yet. * * Caller must hold RTNL. 
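 *
 * Peer-notification accounting, as armed further down in this function:
 *
 *	bond->send_peer_notif = bond->params.num_peer_notif *
 *				max(1, bond->params.peer_notif_delay);
 *
 * bond_should_notify_peers() above only fires while that counter is a
 * multiple of max(1, peer_notif_delay), so (assuming the periodic link
 * monitor, not shown in this excerpt, decrements the counter) the peers
 * end up being notified num_peer_notif times, spaced peer_notif_delay
 * intervals apart.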
*/ void bond_change_active_slave(struct bonding *bond, struct slave *new_active) { struct slave *old_active; ASSERT_RTNL(); old_active = rtnl_dereference(bond->curr_active_slave); if (old_active == new_active) return; #ifdef CONFIG_XFRM_OFFLOAD bond_ipsec_del_sa_all(bond); #endif /* CONFIG_XFRM_OFFLOAD */ if (new_active) { new_active->last_link_up = jiffies; if (new_active->link == BOND_LINK_BACK) { if (bond_uses_primary(bond)) { slave_info(bond->dev, new_active->dev, "making interface the new active one %d ms earlier\n", (bond->params.updelay - new_active->delay) * bond->params.miimon); } new_active->delay = 0; bond_set_slave_link_state(new_active, BOND_LINK_UP, BOND_SLAVE_NOTIFY_NOW); if (BOND_MODE(bond) == BOND_MODE_8023AD) bond_3ad_handle_link_change(new_active, BOND_LINK_UP); if (bond_is_lb(bond)) bond_alb_handle_link_change(bond, new_active, BOND_LINK_UP); } else { if (bond_uses_primary(bond)) slave_info(bond->dev, new_active->dev, "making interface the new active one\n"); } } if (bond_uses_primary(bond)) bond_hw_addr_swap(bond, new_active, old_active); if (bond_is_lb(bond)) { bond_alb_handle_active_change(bond, new_active); if (old_active) bond_set_slave_inactive_flags(old_active, BOND_SLAVE_NOTIFY_NOW); if (new_active) bond_set_slave_active_flags(new_active, BOND_SLAVE_NOTIFY_NOW); } else { rcu_assign_pointer(bond->curr_active_slave, new_active); } if (BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP) { if (old_active) bond_set_slave_inactive_flags(old_active, BOND_SLAVE_NOTIFY_NOW); if (new_active) { bool should_notify_peers = false; bond_set_slave_active_flags(new_active, BOND_SLAVE_NOTIFY_NOW); if (bond->params.fail_over_mac) bond_do_fail_over_mac(bond, new_active, old_active); if (netif_running(bond->dev)) { bond->send_peer_notif = bond->params.num_peer_notif * max(1, bond->params.peer_notif_delay); should_notify_peers = bond_should_notify_peers(bond); } call_netdevice_notifiers(NETDEV_BONDING_FAILOVER, bond->dev); if (should_notify_peers) { bond->send_peer_notif--; call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, bond->dev); } } } #ifdef CONFIG_XFRM_OFFLOAD bond_ipsec_add_sa_all(bond); #endif /* CONFIG_XFRM_OFFLOAD */ /* resend IGMP joins since active slave has changed or * all were sent on curr_active_slave. * resend only if bond is brought up with the affected * bonding modes and the retransmission is enabled */ if (netif_running(bond->dev) && (bond->params.resend_igmp > 0) && ((bond_uses_primary(bond) && new_active) || BOND_MODE(bond) == BOND_MODE_ROUNDROBIN)) { bond->igmp_retrans = bond->params.resend_igmp; queue_delayed_work(bond->wq, &bond->mcast_work, 1); } } /** * bond_select_active_slave - select a new active slave, if needed * @bond: our bonding struct * * This functions should be called when one of the following occurs: * - The old curr_active_slave has been released or lost its link. * - The primary_slave has got its link back. * - A slave has got its link back and there's no old curr_active_slave. * * Caller must hold RTNL. 
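 *
 * Typical usage elsewhere in this file (see e.g. bond_enslave()):
 *
 *	block_netpoll_tx();
 *	bond_select_active_slave(bond);
 *	unblock_netpoll_tx();
 *
 * the netpoll-tx block (see netpoll_block_tx above) keeps netpoll
 * transmits from racing with the active-slave switch.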
*/ void bond_select_active_slave(struct bonding *bond) { struct slave *best_slave; int rv; ASSERT_RTNL(); best_slave = bond_find_best_slave(bond); if (best_slave != rtnl_dereference(bond->curr_active_slave)) { bond_change_active_slave(bond, best_slave); rv = bond_set_carrier(bond); if (!rv) return; if (netif_carrier_ok(bond->dev)) netdev_info(bond->dev, "active interface up!\n"); else netdev_info(bond->dev, "now running without any active interface!\n"); } } #ifdef CONFIG_NET_POLL_CONTROLLER static inline int slave_enable_netpoll(struct slave *slave) { struct netpoll *np; int err = 0; np = kzalloc(sizeof(*np), GFP_KERNEL); err = -ENOMEM; if (!np) goto out; err = __netpoll_setup(np, slave->dev); if (err) { kfree(np); goto out; } slave->np = np; out: return err; } static inline void slave_disable_netpoll(struct slave *slave) { struct netpoll *np = slave->np; if (!np) return; slave->np = NULL; __netpoll_free(np); } static void bond_poll_controller(struct net_device *bond_dev) { struct bonding *bond = netdev_priv(bond_dev); struct slave *slave = NULL; struct list_head *iter; struct ad_info ad_info; if (BOND_MODE(bond) == BOND_MODE_8023AD) if (bond_3ad_get_active_agg_info(bond, &ad_info)) return; bond_for_each_slave_rcu(bond, slave, iter) { if (!bond_slave_is_up(slave)) continue; if (BOND_MODE(bond) == BOND_MODE_8023AD) { struct aggregator *agg = SLAVE_AD_INFO(slave)->port.aggregator; if (agg && agg->aggregator_identifier != ad_info.aggregator_id) continue; } netpoll_poll_dev(slave->dev); } } static void bond_netpoll_cleanup(struct net_device *bond_dev) { struct bonding *bond = netdev_priv(bond_dev); struct list_head *iter; struct slave *slave; bond_for_each_slave(bond, slave, iter) if (bond_slave_is_up(slave)) slave_disable_netpoll(slave); } static int bond_netpoll_setup(struct net_device *dev) { struct bonding *bond = netdev_priv(dev); struct list_head *iter; struct slave *slave; int err = 0; bond_for_each_slave(bond, slave, iter) { err = slave_enable_netpoll(slave); if (err) { bond_netpoll_cleanup(dev); break; } } return err; } #else static inline int slave_enable_netpoll(struct slave *slave) { return 0; } static inline void slave_disable_netpoll(struct slave *slave) { } static void bond_netpoll_cleanup(struct net_device *bond_dev) { } #endif /*---------------------------------- IOCTL ----------------------------------*/ static netdev_features_t bond_fix_features(struct net_device *dev, netdev_features_t features) { struct bonding *bond = netdev_priv(dev); struct list_head *iter; netdev_features_t mask; struct slave *slave; mask = features; features = netdev_base_features(features); bond_for_each_slave(bond, slave, iter) { features = netdev_increment_features(features, slave->dev->features, mask); } features = netdev_add_tso_features(features, mask); return features; } #define BOND_VLAN_FEATURES (NETIF_F_HW_CSUM | NETIF_F_SG | \ NETIF_F_FRAGLIST | NETIF_F_GSO_SOFTWARE | \ NETIF_F_GSO_ENCAP_ALL | \ NETIF_F_HIGHDMA | NETIF_F_LRO) #define BOND_ENC_FEATURES (NETIF_F_HW_CSUM | NETIF_F_SG | \ NETIF_F_RXCSUM | NETIF_F_GSO_SOFTWARE | \ NETIF_F_GSO_PARTIAL) #define BOND_MPLS_FEATURES (NETIF_F_HW_CSUM | NETIF_F_SG | \ NETIF_F_GSO_SOFTWARE) #define BOND_GSO_PARTIAL_FEATURES (NETIF_F_GSO_ESP) static void bond_compute_features(struct bonding *bond) { netdev_features_t gso_partial_features = BOND_GSO_PARTIAL_FEATURES; unsigned int dst_release_flag = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM; netdev_features_t vlan_features = BOND_VLAN_FEATURES; netdev_features_t enc_features = BOND_ENC_FEATURES; 
#ifdef CONFIG_XFRM_OFFLOAD netdev_features_t xfrm_features = BOND_XFRM_FEATURES; #endif /* CONFIG_XFRM_OFFLOAD */ netdev_features_t mpls_features = BOND_MPLS_FEATURES; struct net_device *bond_dev = bond->dev; struct list_head *iter; struct slave *slave; unsigned short max_hard_header_len = ETH_HLEN; unsigned int tso_max_size = TSO_MAX_SIZE; u16 tso_max_segs = TSO_MAX_SEGS; if (!bond_has_slaves(bond)) goto done; vlan_features = netdev_base_features(vlan_features); mpls_features = netdev_base_features(mpls_features); bond_for_each_slave(bond, slave, iter) { vlan_features = netdev_increment_features(vlan_features, slave->dev->vlan_features, BOND_VLAN_FEATURES); enc_features = netdev_increment_features(enc_features, slave->dev->hw_enc_features, BOND_ENC_FEATURES); #ifdef CONFIG_XFRM_OFFLOAD xfrm_features = netdev_increment_features(xfrm_features, slave->dev->hw_enc_features, BOND_XFRM_FEATURES); #endif /* CONFIG_XFRM_OFFLOAD */ gso_partial_features = netdev_increment_features(gso_partial_features, slave->dev->gso_partial_features, BOND_GSO_PARTIAL_FEATURES); mpls_features = netdev_increment_features(mpls_features, slave->dev->mpls_features, BOND_MPLS_FEATURES); dst_release_flag &= slave->dev->priv_flags; if (slave->dev->hard_header_len > max_hard_header_len) max_hard_header_len = slave->dev->hard_header_len; tso_max_size = min(tso_max_size, slave->dev->tso_max_size); tso_max_segs = min(tso_max_segs, slave->dev->tso_max_segs); } bond_dev->hard_header_len = max_hard_header_len; done: bond_dev->gso_partial_features = gso_partial_features; bond_dev->vlan_features = vlan_features; bond_dev->hw_enc_features = enc_features | NETIF_F_GSO_ENCAP_ALL | NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX; #ifdef CONFIG_XFRM_OFFLOAD bond_dev->hw_enc_features |= xfrm_features; #endif /* CONFIG_XFRM_OFFLOAD */ bond_dev->mpls_features = mpls_features; netif_set_tso_max_segs(bond_dev, tso_max_segs); netif_set_tso_max_size(bond_dev, tso_max_size); bond_dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; if ((bond_dev->priv_flags & IFF_XMIT_DST_RELEASE_PERM) && dst_release_flag == (IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM)) bond_dev->priv_flags |= IFF_XMIT_DST_RELEASE; netdev_change_features(bond_dev); } static void bond_setup_by_slave(struct net_device *bond_dev, struct net_device *slave_dev) { bool was_up = !!(bond_dev->flags & IFF_UP); dev_close(bond_dev); bond_dev->header_ops = slave_dev->header_ops; bond_dev->type = slave_dev->type; bond_dev->hard_header_len = slave_dev->hard_header_len; bond_dev->needed_headroom = slave_dev->needed_headroom; bond_dev->addr_len = slave_dev->addr_len; memcpy(bond_dev->broadcast, slave_dev->broadcast, slave_dev->addr_len); if (slave_dev->flags & IFF_POINTOPOINT) { bond_dev->flags &= ~(IFF_BROADCAST | IFF_MULTICAST); bond_dev->flags |= (IFF_POINTOPOINT | IFF_NOARP); } if (was_up) dev_open(bond_dev, NULL); } /* On bonding slaves other than the currently active slave, suppress * duplicates except for alb non-mcast/bcast. 
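 *
 * In bond_handle_frame() below a suppressed skb is returned as
 * RX_HANDLER_EXACT (delivered only to exact-match protocol handlers on
 * the slave itself, not passed up as bond traffic), with one exception:
 * link-local frames such as LLDP are let through as RX_HANDLER_PASS so
 * that daemons can still monitor inactive slaves.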
*/ static bool bond_should_deliver_exact_match(struct sk_buff *skb, struct slave *slave, struct bonding *bond) { if (bond_is_slave_inactive(slave)) { if (BOND_MODE(bond) == BOND_MODE_ALB && skb->pkt_type != PACKET_BROADCAST && skb->pkt_type != PACKET_MULTICAST) return false; return true; } return false; } static rx_handler_result_t bond_handle_frame(struct sk_buff **pskb) { struct sk_buff *skb = *pskb; struct slave *slave; struct bonding *bond; int (*recv_probe)(const struct sk_buff *, struct bonding *, struct slave *); int ret = RX_HANDLER_ANOTHER; skb = skb_share_check(skb, GFP_ATOMIC); if (unlikely(!skb)) return RX_HANDLER_CONSUMED; *pskb = skb; slave = bond_slave_get_rcu(skb->dev); bond = slave->bond; recv_probe = READ_ONCE(bond->recv_probe); if (recv_probe) { ret = recv_probe(skb, bond, slave); if (ret == RX_HANDLER_CONSUMED) { consume_skb(skb); return ret; } } /* * For packets determined by bond_should_deliver_exact_match() call to * be suppressed we want to make an exception for link-local packets. * This is necessary for e.g. LLDP daemons to be able to monitor * inactive slave links without being forced to bind to them * explicitly. * * At the same time, packets that are passed to the bonding master * (including link-local ones) can have their originating interface * determined via PACKET_ORIGDEV socket option. */ if (bond_should_deliver_exact_match(skb, slave, bond)) { if (is_link_local_ether_addr(eth_hdr(skb)->h_dest)) return RX_HANDLER_PASS; return RX_HANDLER_EXACT; } skb->dev = bond->dev; if (BOND_MODE(bond) == BOND_MODE_ALB && netif_is_bridge_port(bond->dev) && skb->pkt_type == PACKET_HOST) { if (unlikely(skb_cow_head(skb, skb->data - skb_mac_header(skb)))) { kfree_skb(skb); return RX_HANDLER_CONSUMED; } bond_hw_addr_copy(eth_hdr(skb)->h_dest, bond->dev->dev_addr, bond->dev->addr_len); } return ret; } static enum netdev_lag_tx_type bond_lag_tx_type(struct bonding *bond) { switch (BOND_MODE(bond)) { case BOND_MODE_ROUNDROBIN: return NETDEV_LAG_TX_TYPE_ROUNDROBIN; case BOND_MODE_ACTIVEBACKUP: return NETDEV_LAG_TX_TYPE_ACTIVEBACKUP; case BOND_MODE_BROADCAST: return NETDEV_LAG_TX_TYPE_BROADCAST; case BOND_MODE_XOR: case BOND_MODE_8023AD: return NETDEV_LAG_TX_TYPE_HASH; default: return NETDEV_LAG_TX_TYPE_UNKNOWN; } } static enum netdev_lag_hash bond_lag_hash_type(struct bonding *bond, enum netdev_lag_tx_type type) { if (type != NETDEV_LAG_TX_TYPE_HASH) return NETDEV_LAG_HASH_NONE; switch (bond->params.xmit_policy) { case BOND_XMIT_POLICY_LAYER2: return NETDEV_LAG_HASH_L2; case BOND_XMIT_POLICY_LAYER34: return NETDEV_LAG_HASH_L34; case BOND_XMIT_POLICY_LAYER23: return NETDEV_LAG_HASH_L23; case BOND_XMIT_POLICY_ENCAP23: return NETDEV_LAG_HASH_E23; case BOND_XMIT_POLICY_ENCAP34: return NETDEV_LAG_HASH_E34; case BOND_XMIT_POLICY_VLAN_SRCMAC: return NETDEV_LAG_HASH_VLAN_SRCMAC; default: return NETDEV_LAG_HASH_UNKNOWN; } } static int bond_master_upper_dev_link(struct bonding *bond, struct slave *slave, struct netlink_ext_ack *extack) { struct netdev_lag_upper_info lag_upper_info; enum netdev_lag_tx_type type; int err; type = bond_lag_tx_type(bond); lag_upper_info.tx_type = type; lag_upper_info.hash_type = bond_lag_hash_type(bond, type); err = netdev_master_upper_dev_link(slave->dev, bond->dev, slave, &lag_upper_info, extack); if (err) return err; slave->dev->flags |= IFF_SLAVE; return 0; } static void bond_upper_dev_unlink(struct bonding *bond, struct slave *slave) { netdev_upper_dev_unlink(slave->dev, bond->dev); slave->dev->flags &= ~IFF_SLAVE; } static void slave_kobj_release(struct 
kobject *kobj) { struct slave *slave = to_slave(kobj); struct bonding *bond = bond_get_bond_by_slave(slave); cancel_delayed_work_sync(&slave->notify_work); if (BOND_MODE(bond) == BOND_MODE_8023AD) kfree(SLAVE_AD_INFO(slave)); kfree(slave); } static struct kobj_type slave_ktype = { .release = slave_kobj_release, #ifdef CONFIG_SYSFS .sysfs_ops = &slave_sysfs_ops, #endif }; static int bond_kobj_init(struct slave *slave) { int err; err = kobject_init_and_add(&slave->kobj, &slave_ktype, &(slave->dev->dev.kobj), "bonding_slave"); if (err) kobject_put(&slave->kobj); return err; } static struct slave *bond_alloc_slave(struct bonding *bond, struct net_device *slave_dev) { struct slave *slave = NULL; slave = kzalloc(sizeof(*slave), GFP_KERNEL); if (!slave) return NULL; slave->bond = bond; slave->dev = slave_dev; INIT_DELAYED_WORK(&slave->notify_work, bond_netdev_notify_work); if (bond_kobj_init(slave)) return NULL; if (BOND_MODE(bond) == BOND_MODE_8023AD) { SLAVE_AD_INFO(slave) = kzalloc(sizeof(struct ad_slave_info), GFP_KERNEL); if (!SLAVE_AD_INFO(slave)) { kobject_put(&slave->kobj); return NULL; } } return slave; } static void bond_fill_ifbond(struct bonding *bond, struct ifbond *info) { info->bond_mode = BOND_MODE(bond); info->miimon = bond->params.miimon; info->num_slaves = bond->slave_cnt; } static void bond_fill_ifslave(struct slave *slave, struct ifslave *info) { strcpy(info->slave_name, slave->dev->name); info->link = slave->link; info->state = bond_slave_state(slave); info->link_failure_count = slave->link_failure_count; } static void bond_netdev_notify_work(struct work_struct *_work) { struct slave *slave = container_of(_work, struct slave, notify_work.work); if (rtnl_trylock()) { struct netdev_bonding_info binfo; bond_fill_ifslave(slave, &binfo.slave); bond_fill_ifbond(slave->bond, &binfo.master); netdev_bonding_info_change(slave->dev, &binfo); rtnl_unlock(); } else { queue_delayed_work(slave->bond->wq, &slave->notify_work, 1); } } void bond_queue_slave_event(struct slave *slave) { queue_delayed_work(slave->bond->wq, &slave->notify_work, 0); } void bond_lower_state_changed(struct slave *slave) { struct netdev_lag_lower_state_info info; info.link_up = slave->link == BOND_LINK_UP || slave->link == BOND_LINK_FAIL; info.tx_enabled = bond_is_active_slave(slave); netdev_lower_state_changed(slave->dev, &info); } #define BOND_NL_ERR(bond_dev, extack, errmsg) do { \ if (extack) \ NL_SET_ERR_MSG(extack, errmsg); \ else \ netdev_err(bond_dev, "Error: %s\n", errmsg); \ } while (0) #define SLAVE_NL_ERR(bond_dev, slave_dev, extack, errmsg) do { \ if (extack) \ NL_SET_ERR_MSG(extack, errmsg); \ else \ slave_err(bond_dev, slave_dev, "Error: %s\n", errmsg); \ } while (0) /* The bonding driver uses ether_setup() to convert a master bond device * to ARPHRD_ETHER, that resets the target netdevice's flags so we always * have to restore the IFF_MASTER flag, and only restore IFF_SLAVE and IFF_UP * if they were set */ static void bond_ether_setup(struct net_device *bond_dev) { unsigned int flags = bond_dev->flags & (IFF_SLAVE | IFF_UP); ether_setup(bond_dev); bond_dev->flags |= IFF_MASTER | flags; bond_dev->priv_flags &= ~IFF_TX_SKB_SHARING; } void bond_xdp_set_features(struct net_device *bond_dev) { struct bonding *bond = netdev_priv(bond_dev); xdp_features_t val = NETDEV_XDP_ACT_MASK; struct list_head *iter; struct slave *slave; ASSERT_RTNL(); if (!bond_xdp_check(bond, BOND_MODE(bond)) || !bond_has_slaves(bond)) { xdp_clear_features_flag(bond_dev); return; } bond_for_each_slave(bond, slave, iter) val &= 
slave->dev->xdp_features; val &= ~NETDEV_XDP_ACT_XSK_ZEROCOPY; xdp_set_features_flag(bond_dev, val); } /* enslave device <slave> to bond device <master> */ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev, struct netlink_ext_ack *extack) { struct bonding *bond = netdev_priv(bond_dev); const struct net_device_ops *slave_ops = slave_dev->netdev_ops; struct slave *new_slave = NULL, *prev_slave; struct sockaddr_storage ss; int res = 0, i; if (slave_dev->flags & IFF_MASTER && !netif_is_bond_master(slave_dev)) { BOND_NL_ERR(bond_dev, extack, "Device type (master device) cannot be enslaved"); return -EPERM; } /* already in-use? */ if (netdev_is_rx_handler_busy(slave_dev)) { SLAVE_NL_ERR(bond_dev, slave_dev, extack, "Device is in use and cannot be enslaved"); return -EBUSY; } if (bond_dev == slave_dev) { BOND_NL_ERR(bond_dev, extack, "Cannot enslave bond to itself."); return -EPERM; } /* vlan challenged mutual exclusion */ /* no need to lock since we're protected by rtnl_lock */ if (slave_dev->features & NETIF_F_VLAN_CHALLENGED) { slave_dbg(bond_dev, slave_dev, "is NETIF_F_VLAN_CHALLENGED\n"); if (vlan_uses_dev(bond_dev)) { SLAVE_NL_ERR(bond_dev, slave_dev, extack, "Can not enslave VLAN challenged device to VLAN enabled bond"); return -EPERM; } else { slave_warn(bond_dev, slave_dev, "enslaved VLAN challenged slave. Adding VLANs will be blocked as long as it is part of bond.\n"); } } else { slave_dbg(bond_dev, slave_dev, "is !NETIF_F_VLAN_CHALLENGED\n"); } if (slave_dev->features & NETIF_F_HW_ESP) slave_dbg(bond_dev, slave_dev, "is esp-hw-offload capable\n"); /* Old ifenslave binaries are no longer supported. These can * be identified with moderate accuracy by the state of the slave: * the current ifenslave will set the interface down prior to * enslaving it; the old ifenslave will not. */ if (slave_dev->flags & IFF_UP) { SLAVE_NL_ERR(bond_dev, slave_dev, extack, "Device can not be enslaved while up"); return -EPERM; } /* set bonding device ether type by slave - bonding netdevices are * created with ether_setup, so when the slave type is not ARPHRD_ETHER * there is a need to override some of the type dependent attribs/funcs. 
* * bond ether type mutual exclusion - don't allow slaves of dissimilar * ether type (eg ARPHRD_ETHER and ARPHRD_INFINIBAND) share the same bond */ if (!bond_has_slaves(bond)) { if (bond_dev->type != slave_dev->type) { slave_dbg(bond_dev, slave_dev, "change device type from %d to %d\n", bond_dev->type, slave_dev->type); res = call_netdevice_notifiers(NETDEV_PRE_TYPE_CHANGE, bond_dev); res = notifier_to_errno(res); if (res) { slave_err(bond_dev, slave_dev, "refused to change device type\n"); return -EBUSY; } /* Flush unicast and multicast addresses */ dev_uc_flush(bond_dev); dev_mc_flush(bond_dev); if (slave_dev->type != ARPHRD_ETHER) bond_setup_by_slave(bond_dev, slave_dev); else bond_ether_setup(bond_dev); call_netdevice_notifiers(NETDEV_POST_TYPE_CHANGE, bond_dev); } } else if (bond_dev->type != slave_dev->type) { SLAVE_NL_ERR(bond_dev, slave_dev, extack, "Device type is different from other slaves"); return -EINVAL; } if (slave_dev->type == ARPHRD_INFINIBAND && BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP) { SLAVE_NL_ERR(bond_dev, slave_dev, extack, "Only active-backup mode is supported for infiniband slaves"); res = -EOPNOTSUPP; goto err_undo_flags; } if (!slave_ops->ndo_set_mac_address || slave_dev->type == ARPHRD_INFINIBAND) { slave_warn(bond_dev, slave_dev, "The slave device specified does not support setting the MAC address\n"); if (BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP && bond->params.fail_over_mac != BOND_FOM_ACTIVE) { if (!bond_has_slaves(bond)) { bond->params.fail_over_mac = BOND_FOM_ACTIVE; slave_warn(bond_dev, slave_dev, "Setting fail_over_mac to active for active-backup mode\n"); } else { SLAVE_NL_ERR(bond_dev, slave_dev, extack, "Slave device does not support setting the MAC address, but fail_over_mac is not set to active"); res = -EOPNOTSUPP; goto err_undo_flags; } } } call_netdevice_notifiers(NETDEV_JOIN, slave_dev); /* If this is the first slave, then we need to set the master's hardware * address to be the same as the slave's. */ if (!bond_has_slaves(bond) && bond->dev->addr_assign_type == NET_ADDR_RANDOM) { res = bond_set_dev_addr(bond->dev, slave_dev); if (res) goto err_undo_flags; } new_slave = bond_alloc_slave(bond, slave_dev); if (!new_slave) { res = -ENOMEM; goto err_undo_flags; } /* Set the new_slave's queue_id to be zero. Queue ID mapping * is set via sysfs or module option if desired. */ new_slave->queue_id = 0; /* Save slave's original mtu and then set it to match the bond */ new_slave->original_mtu = slave_dev->mtu; res = dev_set_mtu(slave_dev, bond->dev->mtu); if (res) { slave_err(bond_dev, slave_dev, "Error %d calling dev_set_mtu\n", res); goto err_free; } /* Save slave's original ("permanent") mac address for modes * that need it, and for restoring it upon release, and then * set it to the master's address */ bond_hw_addr_copy(new_slave->perm_hwaddr, slave_dev->dev_addr, slave_dev->addr_len); if (!bond->params.fail_over_mac || BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP) { /* Set slave to master's mac address. The application already * set the master's mac address to that of the first slave */ memcpy(ss.__data, bond_dev->dev_addr, bond_dev->addr_len); } else if (bond->params.fail_over_mac == BOND_FOM_FOLLOW && BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP && bond_has_slaves(bond) && memcmp(slave_dev->dev_addr, bond_dev->dev_addr, bond_dev->addr_len) == 0) { /* Set slave to random address to avoid duplicate mac * address in later fail over. 
*/ eth_random_addr(ss.__data); } else { goto skip_mac_set; } ss.ss_family = slave_dev->type; res = dev_set_mac_address(slave_dev, &ss, extack); if (res) { slave_err(bond_dev, slave_dev, "Error %d calling set_mac_address\n", res); goto err_restore_mtu; } skip_mac_set: /* set no_addrconf flag before open to prevent IPv6 addrconf */ slave_dev->priv_flags |= IFF_NO_ADDRCONF; /* open the slave since the application closed it */ res = dev_open(slave_dev, extack); if (res) { slave_err(bond_dev, slave_dev, "Opening slave failed\n"); goto err_restore_mac; } slave_dev->priv_flags |= IFF_BONDING; /* initialize slave stats */ dev_get_stats(new_slave->dev, &new_slave->slave_stats); if (bond_is_lb(bond)) { /* bond_alb_init_slave() must be called before all other stages since * it might fail and we do not want to have to undo everything */ res = bond_alb_init_slave(bond, new_slave); if (res) goto err_close; } res = vlan_vids_add_by_dev(slave_dev, bond_dev); if (res) { slave_err(bond_dev, slave_dev, "Couldn't add bond vlan ids\n"); goto err_close; } prev_slave = bond_last_slave(bond); new_slave->delay = 0; new_slave->link_failure_count = 0; if (bond_update_speed_duplex(new_slave) && bond_needs_speed_duplex(bond)) new_slave->link = BOND_LINK_DOWN; new_slave->last_rx = jiffies - (msecs_to_jiffies(bond->params.arp_interval) + 1); for (i = 0; i < BOND_MAX_ARP_TARGETS; i++) new_slave->target_last_arp_rx[i] = new_slave->last_rx; new_slave->last_tx = new_slave->last_rx; /* check for initial state */ new_slave->link = BOND_LINK_NOCHANGE; if (bond->params.miimon) { if (netif_carrier_ok(slave_dev)) { if (bond->params.updelay) { bond_set_slave_link_state(new_slave, BOND_LINK_BACK, BOND_SLAVE_NOTIFY_NOW); new_slave->delay = bond->params.updelay; } else { bond_set_slave_link_state(new_slave, BOND_LINK_UP, BOND_SLAVE_NOTIFY_NOW); } } else { bond_set_slave_link_state(new_slave, BOND_LINK_DOWN, BOND_SLAVE_NOTIFY_NOW); } } else if (bond->params.arp_interval) { bond_set_slave_link_state(new_slave, (netif_carrier_ok(slave_dev) ? BOND_LINK_UP : BOND_LINK_DOWN), BOND_SLAVE_NOTIFY_NOW); } else { bond_set_slave_link_state(new_slave, BOND_LINK_UP, BOND_SLAVE_NOTIFY_NOW); } if (new_slave->link != BOND_LINK_DOWN) new_slave->last_link_up = jiffies; slave_dbg(bond_dev, slave_dev, "Initial state of slave is BOND_LINK_%s\n", new_slave->link == BOND_LINK_DOWN ? "DOWN" : (new_slave->link == BOND_LINK_UP ? 
"UP" : "BACK")); if (bond_uses_primary(bond) && bond->params.primary[0]) { /* if there is a primary slave, remember it */ if (strcmp(bond->params.primary, new_slave->dev->name) == 0) { rcu_assign_pointer(bond->primary_slave, new_slave); bond->force_primary = true; } } switch (BOND_MODE(bond)) { case BOND_MODE_ACTIVEBACKUP: bond_set_slave_inactive_flags(new_slave, BOND_SLAVE_NOTIFY_NOW); break; case BOND_MODE_8023AD: /* in 802.3ad mode, the internal mechanism * will activate the slaves in the selected * aggregator */ bond_set_slave_inactive_flags(new_slave, BOND_SLAVE_NOTIFY_NOW); /* if this is the first slave */ if (!prev_slave) { SLAVE_AD_INFO(new_slave)->id = 1; /* Initialize AD with the number of times that the AD timer is called in 1 second * can be called only after the mac address of the bond is set */ bond_3ad_initialize(bond); } else { SLAVE_AD_INFO(new_slave)->id = SLAVE_AD_INFO(prev_slave)->id + 1; } bond_3ad_bind_slave(new_slave); break; case BOND_MODE_TLB: case BOND_MODE_ALB: bond_set_active_slave(new_slave); bond_set_slave_inactive_flags(new_slave, BOND_SLAVE_NOTIFY_NOW); break; default: slave_dbg(bond_dev, slave_dev, "This slave is always active in trunk mode\n"); /* always active in trunk mode */ bond_set_active_slave(new_slave); /* In trunking mode there is little meaning to curr_active_slave * anyway (it holds no special properties of the bond device), * so we can change it without calling change_active_interface() */ if (!rcu_access_pointer(bond->curr_active_slave) && new_slave->link == BOND_LINK_UP) rcu_assign_pointer(bond->curr_active_slave, new_slave); break; } /* switch(bond_mode) */ #ifdef CONFIG_NET_POLL_CONTROLLER if (bond->dev->npinfo) { if (slave_enable_netpoll(new_slave)) { slave_info(bond_dev, slave_dev, "master_dev is using netpoll, but new slave device does not support netpoll\n"); res = -EBUSY; goto err_detach; } } #endif if (!(bond_dev->features & NETIF_F_LRO)) dev_disable_lro(slave_dev); res = netdev_rx_handler_register(slave_dev, bond_handle_frame, new_slave); if (res) { slave_dbg(bond_dev, slave_dev, "Error %d calling netdev_rx_handler_register\n", res); goto err_detach; } res = bond_master_upper_dev_link(bond, new_slave, extack); if (res) { slave_dbg(bond_dev, slave_dev, "Error %d calling bond_master_upper_dev_link\n", res); goto err_unregister; } bond_lower_state_changed(new_slave); res = bond_sysfs_slave_add(new_slave); if (res) { slave_dbg(bond_dev, slave_dev, "Error %d calling bond_sysfs_slave_add\n", res); goto err_upper_unlink; } /* If the mode uses primary, then the following is handled by * bond_change_active_slave(). */ if (!bond_uses_primary(bond)) { /* set promiscuity level to new slave */ if (bond_dev->flags & IFF_PROMISC) { res = dev_set_promiscuity(slave_dev, 1); if (res) goto err_sysfs_del; } /* set allmulti level to new slave */ if (bond_dev->flags & IFF_ALLMULTI) { res = dev_set_allmulti(slave_dev, 1); if (res) { if (bond_dev->flags & IFF_PROMISC) dev_set_promiscuity(slave_dev, -1); goto err_sysfs_del; } } if (bond_dev->flags & IFF_UP) { netif_addr_lock_bh(bond_dev); dev_mc_sync_multiple(slave_dev, bond_dev); dev_uc_sync_multiple(slave_dev, bond_dev); netif_addr_unlock_bh(bond_dev); if (BOND_MODE(bond) == BOND_MODE_8023AD) dev_mc_add(slave_dev, lacpdu_mcast_addr); } } bond->slave_cnt++; bond_compute_features(bond); bond_set_carrier(bond); /* Needs to be called before bond_select_active_slave(), which will * remove the maddrs if the slave is selected as active slave. 
*/ bond_slave_ns_maddrs_add(bond, new_slave); if (bond_uses_primary(bond)) { block_netpoll_tx(); bond_select_active_slave(bond); unblock_netpoll_tx(); } if (bond_mode_can_use_xmit_hash(bond)) bond_update_slave_arr(bond, NULL); if (!slave_dev->netdev_ops->ndo_bpf || !slave_dev->netdev_ops->ndo_xdp_xmit) { if (bond->xdp_prog) { SLAVE_NL_ERR(bond_dev, slave_dev, extack, "Slave does not support XDP"); res = -EOPNOTSUPP; goto err_sysfs_del; } } else if (bond->xdp_prog) { struct netdev_bpf xdp = { .command = XDP_SETUP_PROG, .flags = 0, .prog = bond->xdp_prog, .extack = extack, }; if (dev_xdp_prog_count(slave_dev) > 0) { SLAVE_NL_ERR(bond_dev, slave_dev, extack, "Slave has XDP program loaded, please unload before enslaving"); res = -EOPNOTSUPP; goto err_sysfs_del; } res = dev_xdp_propagate(slave_dev, &xdp); if (res < 0) { /* ndo_bpf() sets extack error message */ slave_dbg(bond_dev, slave_dev, "Error %d calling ndo_bpf\n", res); goto err_sysfs_del; } if (bond->xdp_prog) bpf_prog_inc(bond->xdp_prog); } bond_xdp_set_features(bond_dev); slave_info(bond_dev, slave_dev, "Enslaving as %s interface with %s link\n", bond_is_active_slave(new_slave) ? "an active" : "a backup", new_slave->link != BOND_LINK_DOWN ? "an up" : "a down"); /* enslave is successful */ bond_queue_slave_event(new_slave); return 0; /* Undo stages on error */ err_sysfs_del: bond_sysfs_slave_del(new_slave); err_upper_unlink: bond_upper_dev_unlink(bond, new_slave); err_unregister: netdev_rx_handler_unregister(slave_dev); err_detach: vlan_vids_del_by_dev(slave_dev, bond_dev); if (rcu_access_pointer(bond->primary_slave) == new_slave) RCU_INIT_POINTER(bond->primary_slave, NULL); if (rcu_access_pointer(bond->curr_active_slave) == new_slave) { block_netpoll_tx(); bond_change_active_slave(bond, NULL); bond_select_active_slave(bond); unblock_netpoll_tx(); } /* either primary_slave or curr_active_slave might've changed */ synchronize_rcu(); slave_disable_netpoll(new_slave); err_close: if (!netif_is_bond_master(slave_dev)) slave_dev->priv_flags &= ~IFF_BONDING; dev_close(slave_dev); err_restore_mac: slave_dev->priv_flags &= ~IFF_NO_ADDRCONF; if (!bond->params.fail_over_mac || BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP) { /* XXX TODO - fom follow mode needs to change master's * MAC if this slave's MAC is in use by the bond, or at * least print a warning. */ bond_hw_addr_copy(ss.__data, new_slave->perm_hwaddr, new_slave->dev->addr_len); ss.ss_family = slave_dev->type; dev_set_mac_address(slave_dev, &ss, NULL); } err_restore_mtu: dev_set_mtu(slave_dev, new_slave->original_mtu); err_free: kobject_put(&new_slave->kobj); err_undo_flags: /* Enslave of first slave has failed and we need to fix master's mac */ if (!bond_has_slaves(bond)) { if (ether_addr_equal_64bits(bond_dev->dev_addr, slave_dev->dev_addr)) eth_hw_addr_random(bond_dev); if (bond_dev->type != ARPHRD_ETHER) { dev_close(bond_dev); bond_ether_setup(bond_dev); } } return res; } /* Try to release the slave device <slave> from the bond device <master> * It is legal to access curr_active_slave without a lock because all the function * is RTNL-locked. If "all" is true it means that the function is being called * while destroying a bond interface and all slaves are being released. * * The rules for slave state should be: * for Active/Backup: * Active stays on all backups go down * for Bonded connections: * The first up interface should be left on and all others downed. 
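 *
 * Two thin wrappers below build on this: bond_release() (used for
 * ndo_del_link) passes all=false, unregister=false, and
 * bond_release_and_destroy() additionally unregisters the bond device
 * once the last slave is gone.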
*/ static int __bond_release_one(struct net_device *bond_dev, struct net_device *slave_dev, bool all, bool unregister) { struct bonding *bond = netdev_priv(bond_dev); struct slave *slave, *oldcurrent; struct sockaddr_storage ss; int old_flags = bond_dev->flags; netdev_features_t old_features = bond_dev->features; /* slave is not a slave or master is not master of this slave */ if (!(slave_dev->flags & IFF_SLAVE) || !netdev_has_upper_dev(slave_dev, bond_dev)) { slave_dbg(bond_dev, slave_dev, "cannot release slave\n"); return -EINVAL; } block_netpoll_tx(); slave = bond_get_slave_by_dev(bond, slave_dev); if (!slave) { /* not a slave of this bond */ slave_info(bond_dev, slave_dev, "interface not enslaved\n"); unblock_netpoll_tx(); return -EINVAL; } bond_set_slave_inactive_flags(slave, BOND_SLAVE_NOTIFY_NOW); bond_sysfs_slave_del(slave); /* recompute stats just before removing the slave */ bond_get_stats(bond->dev, &bond->bond_stats); if (bond->xdp_prog) { struct netdev_bpf xdp = { .command = XDP_SETUP_PROG, .flags = 0, .prog = NULL, .extack = NULL, }; if (dev_xdp_propagate(slave_dev, &xdp)) slave_warn(bond_dev, slave_dev, "failed to unload XDP program\n"); } /* unregister rx_handler early so bond_handle_frame wouldn't be called * for this slave anymore. */ netdev_rx_handler_unregister(slave_dev); if (BOND_MODE(bond) == BOND_MODE_8023AD) bond_3ad_unbind_slave(slave); bond_upper_dev_unlink(bond, slave); if (bond_mode_can_use_xmit_hash(bond)) bond_update_slave_arr(bond, slave); slave_info(bond_dev, slave_dev, "Releasing %s interface\n", bond_is_active_slave(slave) ? "active" : "backup"); oldcurrent = rcu_access_pointer(bond->curr_active_slave); RCU_INIT_POINTER(bond->current_arp_slave, NULL); if (!all && (bond->params.fail_over_mac != BOND_FOM_ACTIVE || BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP)) { if (ether_addr_equal_64bits(bond_dev->dev_addr, slave->perm_hwaddr) && bond_has_slaves(bond)) slave_warn(bond_dev, slave_dev, "the permanent HWaddr of slave - %pM - is still in use by bond - set the HWaddr of slave to a different address to avoid conflicts\n", slave->perm_hwaddr); } if (rtnl_dereference(bond->primary_slave) == slave) RCU_INIT_POINTER(bond->primary_slave, NULL); if (oldcurrent == slave) bond_change_active_slave(bond, NULL); /* Must be called after bond_change_active_slave () as the slave * might change from an active slave to a backup slave. Then it is * necessary to clear the maddrs on the backup slave. */ bond_slave_ns_maddrs_del(bond, slave); if (bond_is_lb(bond)) { /* Must be called only after the slave has been * detached from the list and the curr_active_slave * has been cleared (if our_slave == old_current), * but before a new active slave is selected. */ bond_alb_deinit_slave(bond, slave); } if (all) { RCU_INIT_POINTER(bond->curr_active_slave, NULL); } else if (oldcurrent == slave) { /* Note that we hold RTNL over this sequence, so there * is no concern that another slave add/remove event * will interfere. 
*/ bond_select_active_slave(bond); } bond_set_carrier(bond); if (!bond_has_slaves(bond)) eth_hw_addr_random(bond_dev); unblock_netpoll_tx(); synchronize_rcu(); bond->slave_cnt--; if (!bond_has_slaves(bond)) { call_netdevice_notifiers(NETDEV_CHANGEADDR, bond->dev); call_netdevice_notifiers(NETDEV_RELEASE, bond->dev); } bond_compute_features(bond); if (!(bond_dev->features & NETIF_F_VLAN_CHALLENGED) && (old_features & NETIF_F_VLAN_CHALLENGED)) slave_info(bond_dev, slave_dev, "last VLAN challenged slave left bond - VLAN blocking is removed\n"); vlan_vids_del_by_dev(slave_dev, bond_dev); /* If the mode uses primary, then this case was handled above by * bond_change_active_slave(..., NULL) */ if (!bond_uses_primary(bond)) { /* unset promiscuity level from slave * NOTE: The NETDEV_CHANGEADDR call above may change the value * of the IFF_PROMISC flag in the bond_dev, but we need the * value of that flag before that change, as that was the value * when this slave was attached, so we cache at the start of the * function and use it here. Same goes for ALLMULTI below */ if (old_flags & IFF_PROMISC) dev_set_promiscuity(slave_dev, -1); /* unset allmulti level from slave */ if (old_flags & IFF_ALLMULTI) dev_set_allmulti(slave_dev, -1); if (old_flags & IFF_UP) bond_hw_addr_flush(bond_dev, slave_dev); } slave_disable_netpoll(slave); /* close slave before restoring its mac address */ dev_close(slave_dev); slave_dev->priv_flags &= ~IFF_NO_ADDRCONF; if (bond->params.fail_over_mac != BOND_FOM_ACTIVE || BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP) { /* restore original ("permanent") mac address */ bond_hw_addr_copy(ss.__data, slave->perm_hwaddr, slave->dev->addr_len); ss.ss_family = slave_dev->type; dev_set_mac_address(slave_dev, &ss, NULL); } if (unregister) { netdev_lock_ops(slave_dev); __netif_set_mtu(slave_dev, slave->original_mtu); netdev_unlock_ops(slave_dev); } else { dev_set_mtu(slave_dev, slave->original_mtu); } if (!netif_is_bond_master(slave_dev)) slave_dev->priv_flags &= ~IFF_BONDING; bond_xdp_set_features(bond_dev); kobject_put(&slave->kobj); return 0; } /* A wrapper used because of ndo_del_link */ int bond_release(struct net_device *bond_dev, struct net_device *slave_dev) { return __bond_release_one(bond_dev, slave_dev, false, false); } /* First release a slave and then destroy the bond if no more slaves are left. * Must be under rtnl_lock when this function is called. 
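 * This is the path taken from bond_slave_netdev_event() below when a
 * slave of a non-Ethernet-typed bond is unregistered: if the released
 * slave was the last one, the now-empty bond itself is unregistered.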
*/ static int bond_release_and_destroy(struct net_device *bond_dev, struct net_device *slave_dev) { struct bonding *bond = netdev_priv(bond_dev); int ret; ret = __bond_release_one(bond_dev, slave_dev, false, true); if (ret == 0 && !bond_has_slaves(bond) && bond_dev->reg_state != NETREG_UNREGISTERING) { bond_dev->priv_flags |= IFF_DISABLE_NETPOLL; netdev_info(bond_dev, "Destroying bond\n"); bond_remove_proc_entry(bond); unregister_netdevice(bond_dev); } return ret; } static void bond_info_query(struct net_device *bond_dev, struct ifbond *info) { struct bonding *bond = netdev_priv(bond_dev); bond_fill_ifbond(bond, info); } static int bond_slave_info_query(struct net_device *bond_dev, struct ifslave *info) { struct bonding *bond = netdev_priv(bond_dev); struct list_head *iter; int i = 0, res = -ENODEV; struct slave *slave; bond_for_each_slave(bond, slave, iter) { if (i++ == (int)info->slave_id) { res = 0; bond_fill_ifslave(slave, info); break; } } return res; } /*-------------------------------- Monitoring -------------------------------*/ /* called with rcu_read_lock() */ static int bond_miimon_inspect(struct bonding *bond) { bool ignore_updelay = false; int link_state, commit = 0; struct list_head *iter; struct slave *slave; if (BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP) { ignore_updelay = !rcu_dereference(bond->curr_active_slave); } else { struct bond_up_slave *usable_slaves; usable_slaves = rcu_dereference(bond->usable_slaves); if (usable_slaves && usable_slaves->count == 0) ignore_updelay = true; } bond_for_each_slave_rcu(bond, slave, iter) { bond_propose_link_state(slave, BOND_LINK_NOCHANGE); link_state = netif_carrier_ok(slave->dev); switch (slave->link) { case BOND_LINK_UP: if (link_state) continue; bond_propose_link_state(slave, BOND_LINK_FAIL); commit++; slave->delay = bond->params.downdelay; if (slave->delay && net_ratelimit()) { slave_info(bond->dev, slave->dev, "link status down for %sinterface, disabling it in %d ms\n", (BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP) ? (bond_is_active_slave(slave) ? "active " : "backup ") : "", bond->params.downdelay * bond->params.miimon); } fallthrough; case BOND_LINK_FAIL: if (link_state) { /* recovered before downdelay expired */ bond_propose_link_state(slave, BOND_LINK_UP); slave->last_link_up = jiffies; if (net_ratelimit()) slave_info(bond->dev, slave->dev, "link status up again after %d ms\n", (bond->params.downdelay - slave->delay) * bond->params.miimon); commit++; continue; } if (slave->delay <= 0) { bond_propose_link_state(slave, BOND_LINK_DOWN); commit++; continue; } slave->delay--; break; case BOND_LINK_DOWN: if (!link_state) continue; bond_propose_link_state(slave, BOND_LINK_BACK); commit++; slave->delay = bond->params.updelay; if (slave->delay && net_ratelimit()) { slave_info(bond->dev, slave->dev, "link status up, enabling it in %d ms\n", ignore_updelay ? 
0 : bond->params.updelay * bond->params.miimon); } fallthrough; case BOND_LINK_BACK: if (!link_state) { bond_propose_link_state(slave, BOND_LINK_DOWN); if (net_ratelimit()) slave_info(bond->dev, slave->dev, "link status down again after %d ms\n", (bond->params.updelay - slave->delay) * bond->params.miimon); commit++; continue; } if (ignore_updelay) slave->delay = 0; if (slave->delay <= 0) { bond_propose_link_state(slave, BOND_LINK_UP); commit++; ignore_updelay = false; continue; } slave->delay--; break; } } return commit; } static void bond_miimon_link_change(struct bonding *bond, struct slave *slave, char link) { switch (BOND_MODE(bond)) { case BOND_MODE_8023AD: bond_3ad_handle_link_change(slave, link); break; case BOND_MODE_TLB: case BOND_MODE_ALB: bond_alb_handle_link_change(bond, slave, link); break; case BOND_MODE_XOR: bond_update_slave_arr(bond, NULL); break; } } static void bond_miimon_commit(struct bonding *bond) { struct slave *slave, *primary, *active; bool do_failover = false; struct list_head *iter; ASSERT_RTNL(); bond_for_each_slave(bond, slave, iter) { switch (slave->link_new_state) { case BOND_LINK_NOCHANGE: /* For 802.3ad mode, check current slave speed and * duplex again in case its port was disabled after * invalid speed/duplex reporting but recovered before * link monitoring could make a decision on the actual * link status */ if (BOND_MODE(bond) == BOND_MODE_8023AD && slave->link == BOND_LINK_UP) bond_3ad_adapter_speed_duplex_changed(slave); continue; case BOND_LINK_UP: if (bond_update_speed_duplex(slave) && bond_needs_speed_duplex(bond)) { slave->link = BOND_LINK_DOWN; if (net_ratelimit()) slave_warn(bond->dev, slave->dev, "failed to get link speed/duplex\n"); continue; } bond_set_slave_link_state(slave, BOND_LINK_UP, BOND_SLAVE_NOTIFY_NOW); slave->last_link_up = jiffies; primary = rtnl_dereference(bond->primary_slave); if (BOND_MODE(bond) == BOND_MODE_8023AD) { /* prevent it from being the active one */ bond_set_backup_slave(slave); } else if (BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP) { /* make it immediately active */ bond_set_active_slave(slave); } slave_info(bond->dev, slave->dev, "link status definitely up, %u Mbps %s duplex\n", slave->speed == SPEED_UNKNOWN ? 0 : slave->speed, slave->duplex ? 
"full" : "half"); bond_miimon_link_change(bond, slave, BOND_LINK_UP); active = rtnl_dereference(bond->curr_active_slave); if (!active || slave == primary || slave->prio > active->prio) do_failover = true; continue; case BOND_LINK_DOWN: if (slave->link_failure_count < UINT_MAX) slave->link_failure_count++; bond_set_slave_link_state(slave, BOND_LINK_DOWN, BOND_SLAVE_NOTIFY_NOW); if (BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP || BOND_MODE(bond) == BOND_MODE_8023AD) bond_set_slave_inactive_flags(slave, BOND_SLAVE_NOTIFY_NOW); slave_info(bond->dev, slave->dev, "link status definitely down, disabling slave\n"); bond_miimon_link_change(bond, slave, BOND_LINK_DOWN); if (slave == rcu_access_pointer(bond->curr_active_slave)) do_failover = true; continue; default: slave_err(bond->dev, slave->dev, "invalid new link %d on slave\n", slave->link_new_state); bond_propose_link_state(slave, BOND_LINK_NOCHANGE); continue; } } if (do_failover) { block_netpoll_tx(); bond_select_active_slave(bond); unblock_netpoll_tx(); } bond_set_carrier(bond); } /* bond_mii_monitor * * Really a wrapper that splits the mii monitor into two phases: an * inspection, then (if inspection indicates something needs to be done) * an acquisition of appropriate locks followed by a commit phase to * implement whatever link state changes are indicated. */ static void bond_mii_monitor(struct work_struct *work) { struct bonding *bond = container_of(work, struct bonding, mii_work.work); bool should_notify_peers = false; bool commit; unsigned long delay; struct slave *slave; struct list_head *iter; delay = msecs_to_jiffies(bond->params.miimon); if (!bond_has_slaves(bond)) goto re_arm; rcu_read_lock(); should_notify_peers = bond_should_notify_peers(bond); commit = !!bond_miimon_inspect(bond); if (bond->send_peer_notif) { rcu_read_unlock(); if (rtnl_trylock()) { bond->send_peer_notif--; rtnl_unlock(); } } else { rcu_read_unlock(); } if (commit) { /* Race avoidance with bond_close cancel of workqueue */ if (!rtnl_trylock()) { delay = 1; should_notify_peers = false; goto re_arm; } bond_for_each_slave(bond, slave, iter) { bond_commit_link_state(slave, BOND_SLAVE_NOTIFY_LATER); } bond_miimon_commit(bond); rtnl_unlock(); /* might sleep, hold no other locks */ } re_arm: if (bond->params.miimon) queue_delayed_work(bond->wq, &bond->mii_work, delay); if (should_notify_peers) { if (!rtnl_trylock()) return; call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, bond->dev); rtnl_unlock(); } } static int bond_upper_dev_walk(struct net_device *upper, struct netdev_nested_priv *priv) { __be32 ip = *(__be32 *)priv->data; return ip == bond_confirm_addr(upper, 0, ip); } static bool bond_has_this_ip(struct bonding *bond, __be32 ip) { struct netdev_nested_priv priv = { .data = (void *)&ip, }; bool ret = false; if (ip == bond_confirm_addr(bond->dev, 0, ip)) return true; rcu_read_lock(); if (netdev_walk_all_upper_dev_rcu(bond->dev, bond_upper_dev_walk, &priv)) ret = true; rcu_read_unlock(); return ret; } #define BOND_VLAN_PROTO_NONE cpu_to_be16(0xffff) static bool bond_handle_vlan(struct slave *slave, struct bond_vlan_tag *tags, struct sk_buff *skb) { struct net_device *bond_dev = slave->bond->dev; struct net_device *slave_dev = slave->dev; struct bond_vlan_tag *outer_tag = tags; if (!tags || tags->vlan_proto == BOND_VLAN_PROTO_NONE) return true; tags++; /* Go through all the tags backwards and add them to the packet */ while (tags->vlan_proto != BOND_VLAN_PROTO_NONE) { if (!tags->vlan_id) { tags++; continue; } slave_dbg(bond_dev, slave_dev, "inner tag: proto %X vid %X\n", 
ntohs(outer_tag->vlan_proto), tags->vlan_id); skb = vlan_insert_tag_set_proto(skb, tags->vlan_proto, tags->vlan_id); if (!skb) { net_err_ratelimited("failed to insert inner VLAN tag\n"); return false; } tags++; } /* Set the outer tag */ if (outer_tag->vlan_id) { slave_dbg(bond_dev, slave_dev, "outer tag: proto %X vid %X\n", ntohs(outer_tag->vlan_proto), outer_tag->vlan_id); __vlan_hwaccel_put_tag(skb, outer_tag->vlan_proto, outer_tag->vlan_id); } return true; } /* We go to the (large) trouble of VLAN tagging ARP frames because * switches in VLAN mode (especially if ports are configured as * "native" to a VLAN) might not pass non-tagged frames. */ static void bond_arp_send(struct slave *slave, int arp_op, __be32 dest_ip, __be32 src_ip, struct bond_vlan_tag *tags) { struct net_device *bond_dev = slave->bond->dev; struct net_device *slave_dev = slave->dev; struct sk_buff *skb; slave_dbg(bond_dev, slave_dev, "arp %d on slave: dst %pI4 src %pI4\n", arp_op, &dest_ip, &src_ip); skb = arp_create(arp_op, ETH_P_ARP, dest_ip, slave_dev, src_ip, NULL, slave_dev->dev_addr, NULL); if (!skb) { net_err_ratelimited("ARP packet allocation failed\n"); return; } if (bond_handle_vlan(slave, tags, skb)) { slave_update_last_tx(slave); arp_xmit(skb); } return; } /* Validate the device path between the @start_dev and the @end_dev. * The path is valid if the @end_dev is reachable through device * stacking. * When the path is validated, collect any vlan information in the * path. */ struct bond_vlan_tag *bond_verify_device_path(struct net_device *start_dev, struct net_device *end_dev, int level) { struct bond_vlan_tag *tags; struct net_device *upper; struct list_head *iter; if (start_dev == end_dev) { tags = kcalloc(level + 1, sizeof(*tags), GFP_ATOMIC); if (!tags) return ERR_PTR(-ENOMEM); tags[level].vlan_proto = BOND_VLAN_PROTO_NONE; return tags; } netdev_for_each_upper_dev_rcu(start_dev, upper, iter) { tags = bond_verify_device_path(upper, end_dev, level + 1); if (IS_ERR_OR_NULL(tags)) { if (IS_ERR(tags)) return tags; continue; } if (is_vlan_dev(upper)) { tags[level].vlan_proto = vlan_dev_vlan_proto(upper); tags[level].vlan_id = vlan_dev_vlan_id(upper); } return tags; } return NULL; } static void bond_arp_send_all(struct bonding *bond, struct slave *slave) { struct rtable *rt; struct bond_vlan_tag *tags; __be32 *targets = bond->params.arp_targets, addr; int i; for (i = 0; i < BOND_MAX_ARP_TARGETS && targets[i]; i++) { slave_dbg(bond->dev, slave->dev, "%s: target %pI4\n", __func__, &targets[i]); tags = NULL; /* Find out through which dev should the packet go */ rt = ip_route_output(dev_net(bond->dev), targets[i], 0, 0, 0, RT_SCOPE_LINK); if (IS_ERR(rt)) { /* there's no route to target - try to send arp * probe to generate any traffic (arp_validate=0) */ if (bond->params.arp_validate) pr_warn_once("%s: no route to arp_ip_target %pI4 and arp_validate is set\n", bond->dev->name, &targets[i]); bond_arp_send(slave, ARPOP_REQUEST, targets[i], 0, tags); continue; } /* bond device itself */ if (rt->dst.dev == bond->dev) goto found; rcu_read_lock(); tags = bond_verify_device_path(bond->dev, rt->dst.dev, 0); rcu_read_unlock(); if (!IS_ERR_OR_NULL(tags)) goto found; /* Not our device - skip */ slave_dbg(bond->dev, slave->dev, "no path to arp_ip_target %pI4 via rt.dev %s\n", &targets[i], rt->dst.dev ? 
rt->dst.dev->name : "NULL"); ip_rt_put(rt); continue; found: addr = bond_confirm_addr(rt->dst.dev, targets[i], 0); ip_rt_put(rt); bond_arp_send(slave, ARPOP_REQUEST, targets[i], addr, tags); kfree(tags); } } static void bond_validate_arp(struct bonding *bond, struct slave *slave, __be32 sip, __be32 tip) { int i; if (!sip || !bond_has_this_ip(bond, tip)) { slave_dbg(bond->dev, slave->dev, "%s: sip %pI4 tip %pI4 not found\n", __func__, &sip, &tip); return; } i = bond_get_targets_ip(bond->params.arp_targets, sip); if (i == -1) { slave_dbg(bond->dev, slave->dev, "%s: sip %pI4 not found in targets\n", __func__, &sip); return; } slave->last_rx = jiffies; slave->target_last_arp_rx[i] = jiffies; } static int bond_arp_rcv(const struct sk_buff *skb, struct bonding *bond, struct slave *slave) { struct arphdr *arp = (struct arphdr *)skb->data; struct slave *curr_active_slave, *curr_arp_slave; unsigned char *arp_ptr; __be32 sip, tip; unsigned int alen; alen = arp_hdr_len(bond->dev); if (alen > skb_headlen(skb)) { arp = kmalloc(alen, GFP_ATOMIC); if (!arp) goto out_unlock; if (skb_copy_bits(skb, 0, arp, alen) < 0) goto out_unlock; } if (arp->ar_hln != bond->dev->addr_len || skb->pkt_type == PACKET_OTHERHOST || skb->pkt_type == PACKET_LOOPBACK || arp->ar_hrd != htons(ARPHRD_ETHER) || arp->ar_pro != htons(ETH_P_IP) || arp->ar_pln != 4) goto out_unlock; arp_ptr = (unsigned char *)(arp + 1); arp_ptr += bond->dev->addr_len; memcpy(&sip, arp_ptr, 4); arp_ptr += 4 + bond->dev->addr_len; memcpy(&tip, arp_ptr, 4); slave_dbg(bond->dev, slave->dev, "%s: %s/%d av %d sv %d sip %pI4 tip %pI4\n", __func__, slave->dev->name, bond_slave_state(slave), bond->params.arp_validate, slave_do_arp_validate(bond, slave), &sip, &tip); curr_active_slave = rcu_dereference(bond->curr_active_slave); curr_arp_slave = rcu_dereference(bond->current_arp_slave); /* We 'trust' the received ARP enough to validate it if: * * (a) the slave receiving the ARP is active (which includes the * current ARP slave, if any), or * * (b) the receiving slave isn't active, but there is a currently * active slave and it received valid arp reply(s) after it became * the currently active slave, or * * (c) there is an ARP slave that sent an ARP during the prior ARP * interval, and we receive an ARP reply on any slave. We accept * these because switch FDB update delays may deliver the ARP * reply to a slave other than the sender of the ARP request. * * Note: for (b), backup slaves are receiving the broadcast ARP * request, not a reply. This request passes from the sending * slave through the L2 switch(es) to the receiving slave. Since * this is checking the request, sip/tip are swapped for * validation. * * This is done to avoid endless looping when we can't reach the * arp_ip_target and fool ourselves with our own arp requests. 
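 * Illustration (not part of the original comment): in case (b) the
 * active slave broadcasts a request with sip = <bond IP> and
 * tip = <arp_ip_target>; a backup slave that hears this request
 * therefore validates it with sip/tip swapped, so the target address is
 * matched against arp_targets[] and the source address against the
 * bond's own addresses.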
*/ if (bond_is_active_slave(slave)) bond_validate_arp(bond, slave, sip, tip); else if (curr_active_slave && time_after(slave_last_rx(bond, curr_active_slave), curr_active_slave->last_link_up)) bond_validate_arp(bond, slave, tip, sip); else if (curr_arp_slave && (arp->ar_op == htons(ARPOP_REPLY)) && bond_time_in_interval(bond, slave_last_tx(curr_arp_slave), 1)) bond_validate_arp(bond, slave, sip, tip); out_unlock: if (arp != (struct arphdr *)skb->data) kfree(arp); return RX_HANDLER_ANOTHER; } #if IS_ENABLED(CONFIG_IPV6) static void bond_ns_send(struct slave *slave, const struct in6_addr *daddr, const struct in6_addr *saddr, struct bond_vlan_tag *tags) { struct net_device *bond_dev = slave->bond->dev; struct net_device *slave_dev = slave->dev; struct in6_addr mcaddr; struct sk_buff *skb; slave_dbg(bond_dev, slave_dev, "NS on slave: dst %pI6c src %pI6c\n", daddr, saddr); skb = ndisc_ns_create(slave_dev, daddr, saddr, 0); if (!skb) { net_err_ratelimited("NS packet allocation failed\n"); return; } addrconf_addr_solict_mult(daddr, &mcaddr); if (bond_handle_vlan(slave, tags, skb)) { slave_update_last_tx(slave); ndisc_send_skb(skb, &mcaddr, saddr); } } static void bond_ns_send_all(struct bonding *bond, struct slave *slave) { struct in6_addr *targets = bond->params.ns_targets; struct bond_vlan_tag *tags; struct dst_entry *dst; struct in6_addr saddr; struct flowi6 fl6; int i; for (i = 0; i < BOND_MAX_NS_TARGETS && !ipv6_addr_any(&targets[i]); i++) { slave_dbg(bond->dev, slave->dev, "%s: target %pI6c\n", __func__, &targets[i]); tags = NULL; /* Find out through which dev should the packet go */ memset(&fl6, 0, sizeof(struct flowi6)); fl6.daddr = targets[i]; dst = ip6_route_output(dev_net(bond->dev), NULL, &fl6); if (dst->error) { dst_release(dst); /* there's no route to target - try to send arp * probe to generate any traffic (arp_validate=0) */ if (bond->params.arp_validate) pr_warn_once("%s: no route to ns_ip6_target %pI6c and arp_validate is set\n", bond->dev->name, &targets[i]); bond_ns_send(slave, &targets[i], &in6addr_any, tags); continue; } /* bond device itself */ if (dst->dev == bond->dev) goto found; rcu_read_lock(); tags = bond_verify_device_path(bond->dev, dst->dev, 0); rcu_read_unlock(); if (!IS_ERR_OR_NULL(tags)) goto found; /* Not our device - skip */ slave_dbg(bond->dev, slave->dev, "no path to ns_ip6_target %pI6c via dst->dev %s\n", &targets[i], dst->dev ? dst->dev->name : "NULL"); dst_release(dst); continue; found: if (!ipv6_dev_get_saddr(dev_net(dst->dev), dst->dev, &targets[i], 0, &saddr)) bond_ns_send(slave, &targets[i], &saddr, tags); else bond_ns_send(slave, &targets[i], &in6addr_any, tags); dst_release(dst); kfree(tags); } } static int bond_confirm_addr6(struct net_device *dev, struct netdev_nested_priv *priv) { struct in6_addr *addr = (struct in6_addr *)priv->data; return ipv6_chk_addr(dev_net(dev), addr, dev, 0); } static bool bond_has_this_ip6(struct bonding *bond, struct in6_addr *addr) { struct netdev_nested_priv priv = { .data = addr, }; int ret = false; if (bond_confirm_addr6(bond->dev, &priv)) return true; rcu_read_lock(); if (netdev_walk_all_upper_dev_rcu(bond->dev, bond_confirm_addr6, &priv)) ret = true; rcu_read_unlock(); return ret; } static void bond_validate_na(struct bonding *bond, struct slave *slave, struct in6_addr *saddr, struct in6_addr *daddr) { int i; /* Ignore NAs that: * 1. Source address is unspecified address. * 2. Dest address is neither all-nodes multicast address nor * exist on bond interface. 
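 * For example (illustrative): an unsolicited NA sent to the all-nodes
 * address ff02::1 with a source address equal to one of the configured
 * ns_ip6_target addresses passes both checks and refreshes last_rx for
 * that target.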
*/ if (ipv6_addr_any(saddr) || (!ipv6_addr_equal(daddr, &in6addr_linklocal_allnodes) && !bond_has_this_ip6(bond, daddr))) { slave_dbg(bond->dev, slave->dev, "%s: sip %pI6c tip %pI6c not found\n", __func__, saddr, daddr); return; } i = bond_get_targets_ip6(bond->params.ns_targets, saddr); if (i == -1) { slave_dbg(bond->dev, slave->dev, "%s: sip %pI6c not found in targets\n", __func__, saddr); return; } slave->last_rx = jiffies; slave->target_last_arp_rx[i] = jiffies; } static int bond_na_rcv(const struct sk_buff *skb, struct bonding *bond, struct slave *slave) { struct slave *curr_active_slave, *curr_arp_slave; struct in6_addr *saddr, *daddr; struct { struct ipv6hdr ip6; struct icmp6hdr icmp6; } *combined, _combined; if (skb->pkt_type == PACKET_OTHERHOST || skb->pkt_type == PACKET_LOOPBACK) goto out; combined = skb_header_pointer(skb, 0, sizeof(_combined), &_combined); if (!combined || combined->ip6.nexthdr != NEXTHDR_ICMP || (combined->icmp6.icmp6_type != NDISC_NEIGHBOUR_SOLICITATION && combined->icmp6.icmp6_type != NDISC_NEIGHBOUR_ADVERTISEMENT)) goto out; saddr = &combined->ip6.saddr; daddr = &combined->ip6.daddr; slave_dbg(bond->dev, slave->dev, "%s: %s/%d av %d sv %d sip %pI6c tip %pI6c\n", __func__, slave->dev->name, bond_slave_state(slave), bond->params.arp_validate, slave_do_arp_validate(bond, slave), saddr, daddr); curr_active_slave = rcu_dereference(bond->curr_active_slave); curr_arp_slave = rcu_dereference(bond->current_arp_slave); /* We 'trust' the received ARP enough to validate it if: * see bond_arp_rcv(). */ if (bond_is_active_slave(slave)) bond_validate_na(bond, slave, saddr, daddr); else if (curr_active_slave && time_after(slave_last_rx(bond, curr_active_slave), curr_active_slave->last_link_up)) bond_validate_na(bond, slave, daddr, saddr); else if (curr_arp_slave && bond_time_in_interval(bond, slave_last_tx(curr_arp_slave), 1)) bond_validate_na(bond, slave, saddr, daddr); out: return RX_HANDLER_ANOTHER; } #endif int bond_rcv_validate(const struct sk_buff *skb, struct bonding *bond, struct slave *slave) { #if IS_ENABLED(CONFIG_IPV6) bool is_ipv6 = skb->protocol == __cpu_to_be16(ETH_P_IPV6); #endif bool is_arp = skb->protocol == __cpu_to_be16(ETH_P_ARP); slave_dbg(bond->dev, slave->dev, "%s: skb->dev %s\n", __func__, skb->dev->name); /* Use arp validate logic for both ARP and NS */ if (!slave_do_arp_validate(bond, slave)) { if ((slave_do_arp_validate_only(bond) && is_arp) || #if IS_ENABLED(CONFIG_IPV6) (slave_do_arp_validate_only(bond) && is_ipv6) || #endif !slave_do_arp_validate_only(bond)) slave->last_rx = jiffies; return RX_HANDLER_ANOTHER; } else if (is_arp) { return bond_arp_rcv(skb, bond, slave); #if IS_ENABLED(CONFIG_IPV6) } else if (is_ipv6) { return bond_na_rcv(skb, bond, slave); #endif } else { return RX_HANDLER_ANOTHER; } } static void bond_send_validate(struct bonding *bond, struct slave *slave) { bond_arp_send_all(bond, slave); #if IS_ENABLED(CONFIG_IPV6) bond_ns_send_all(bond, slave); #endif } /* function to verify if we're in the arp_interval timeslice, returns true if * (last_act - arp_interval) <= jiffies <= (last_act + mod * arp_interval + * arp_interval/2) . the arp_interval/2 is needed for really fast networks. 
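 * Worked example (illustrative values): with arp_interval = 1000 ms and
 * mod = 3 the accepted window is roughly
 * [last_act - 1 s, last_act + 3.5 s] once converted with msecs_to_jiffies().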
 */
static bool bond_time_in_interval(struct bonding *bond, unsigned long last_act,
				  int mod)
{
	int delta_in_ticks = msecs_to_jiffies(bond->params.arp_interval);

	return time_in_range(jiffies,
			     last_act - delta_in_ticks,
			     last_act + mod * delta_in_ticks + delta_in_ticks/2);
}

/* This function is called regularly to monitor each slave's link
 * ensuring that traffic is being sent and received when arp monitoring
 * is used in load-balancing mode. if the adapter has been dormant, then an
 * arp is transmitted to generate traffic. see activebackup_arp_monitor for
 * arp monitoring in active backup mode.
 */
static void bond_loadbalance_arp_mon(struct bonding *bond)
{
	struct slave *slave, *oldcurrent;
	struct list_head *iter;
	int do_failover = 0, slave_state_changed = 0;

	if (!bond_has_slaves(bond))
		goto re_arm;

	rcu_read_lock();

	oldcurrent = rcu_dereference(bond->curr_active_slave);
	/* see if any of the previous devices are up now (i.e. they have
	 * xmt and rcv traffic). the curr_active_slave does not come into
	 * the picture unless it is null. also, slave->last_link_up is not
	 * needed here because we send an arp on each slave and give a slave
	 * as long as it needs to get the tx/rx within the delta.
	 * TODO: what about up/down delay in arp mode? it wasn't here before
	 *       so it can wait
	 */
	bond_for_each_slave_rcu(bond, slave, iter) {
		unsigned long last_tx = slave_last_tx(slave);

		bond_propose_link_state(slave, BOND_LINK_NOCHANGE);

		if (slave->link != BOND_LINK_UP) {
			if (bond_time_in_interval(bond, last_tx, 1) &&
			    bond_time_in_interval(bond, slave->last_rx, 1)) {

				bond_propose_link_state(slave, BOND_LINK_UP);
				slave_state_changed = 1;

				/* primary_slave has no meaning in round-robin
				 * mode. the window of a slave being up and
				 * curr_active_slave being null after enslaving
				 * is closed.
*/ if (!oldcurrent) { slave_info(bond->dev, slave->dev, "link status definitely up\n"); do_failover = 1; } else { slave_info(bond->dev, slave->dev, "interface is now up\n"); } } } else { /* slave->link == BOND_LINK_UP */ /* not all switches will respond to an arp request * when the source ip is 0, so don't take the link down * if we don't know our ip yet */ if (!bond_time_in_interval(bond, last_tx, bond->params.missed_max) || !bond_time_in_interval(bond, slave->last_rx, bond->params.missed_max)) { bond_propose_link_state(slave, BOND_LINK_DOWN); slave_state_changed = 1; if (slave->link_failure_count < UINT_MAX) slave->link_failure_count++; slave_info(bond->dev, slave->dev, "interface is now down\n"); if (slave == oldcurrent) do_failover = 1; } } /* note: if switch is in round-robin mode, all links * must tx arp to ensure all links rx an arp - otherwise * links may oscillate or not come up at all; if switch is * in something like xor mode, there is nothing we can * do - all replies will be rx'ed on same link causing slaves * to be unstable during low/no traffic periods */ if (bond_slave_is_up(slave)) bond_send_validate(bond, slave); } rcu_read_unlock(); if (do_failover || slave_state_changed) { if (!rtnl_trylock()) goto re_arm; bond_for_each_slave(bond, slave, iter) { if (slave->link_new_state != BOND_LINK_NOCHANGE) slave->link = slave->link_new_state; } if (slave_state_changed) { bond_slave_state_change(bond); if (BOND_MODE(bond) == BOND_MODE_XOR) bond_update_slave_arr(bond, NULL); } if (do_failover) { block_netpoll_tx(); bond_select_active_slave(bond); unblock_netpoll_tx(); } rtnl_unlock(); } re_arm: if (bond->params.arp_interval) queue_delayed_work(bond->wq, &bond->arp_work, msecs_to_jiffies(bond->params.arp_interval)); } /* Called to inspect slaves for active-backup mode ARP monitor link state * changes. Sets proposed link state in slaves to specify what action * should take place for the slave. Returns 0 if no changes are found, >0 * if changes to link states must be committed. * * Called with rcu_read_lock held. */ static int bond_ab_arp_inspect(struct bonding *bond) { unsigned long last_tx, last_rx; struct list_head *iter; struct slave *slave; int commit = 0; bond_for_each_slave_rcu(bond, slave, iter) { bond_propose_link_state(slave, BOND_LINK_NOCHANGE); last_rx = slave_last_rx(bond, slave); if (slave->link != BOND_LINK_UP) { if (bond_time_in_interval(bond, last_rx, 1)) { bond_propose_link_state(slave, BOND_LINK_UP); commit++; } else if (slave->link == BOND_LINK_BACK) { bond_propose_link_state(slave, BOND_LINK_FAIL); commit++; } continue; } /* Give slaves 2*delta after being enslaved or made * active. This avoids bouncing, as the last receive * times need a full ARP monitor cycle to be updated. 
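 * For example (illustrative): with arp_interval = 1000 ms, a slave whose
 * last_link_up is only 1.5 s old is still inside the 2*delta grace window
 * and is skipped here, even if no reply has been recorded for it yet.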
*/ if (bond_time_in_interval(bond, slave->last_link_up, 2)) continue; /* Backup slave is down if: * - No current_arp_slave AND * - more than (missed_max+1)*delta since last receive AND * - the bond has an IP address * * Note: a non-null current_arp_slave indicates * the curr_active_slave went down and we are * searching for a new one; under this condition * we only take the curr_active_slave down - this * gives each slave a chance to tx/rx traffic * before being taken out */ if (!bond_is_active_slave(slave) && !rcu_access_pointer(bond->current_arp_slave) && !bond_time_in_interval(bond, last_rx, bond->params.missed_max + 1)) { bond_propose_link_state(slave, BOND_LINK_DOWN); commit++; } /* Active slave is down if: * - more than missed_max*delta since transmitting OR * - (more than missed_max*delta since receive AND * the bond has an IP address) */ last_tx = slave_last_tx(slave); if (bond_is_active_slave(slave) && (!bond_time_in_interval(bond, last_tx, bond->params.missed_max) || !bond_time_in_interval(bond, last_rx, bond->params.missed_max))) { bond_propose_link_state(slave, BOND_LINK_DOWN); commit++; } } return commit; } /* Called to commit link state changes noted by inspection step of * active-backup mode ARP monitor. * * Called with RTNL hold. */ static void bond_ab_arp_commit(struct bonding *bond) { bool do_failover = false; struct list_head *iter; unsigned long last_tx; struct slave *slave; bond_for_each_slave(bond, slave, iter) { switch (slave->link_new_state) { case BOND_LINK_NOCHANGE: continue; case BOND_LINK_UP: last_tx = slave_last_tx(slave); if (rtnl_dereference(bond->curr_active_slave) != slave || (!rtnl_dereference(bond->curr_active_slave) && bond_time_in_interval(bond, last_tx, 1))) { struct slave *current_arp_slave; current_arp_slave = rtnl_dereference(bond->current_arp_slave); bond_set_slave_link_state(slave, BOND_LINK_UP, BOND_SLAVE_NOTIFY_NOW); if (current_arp_slave) { bond_set_slave_inactive_flags( current_arp_slave, BOND_SLAVE_NOTIFY_NOW); RCU_INIT_POINTER(bond->current_arp_slave, NULL); } slave_info(bond->dev, slave->dev, "link status definitely up\n"); if (!rtnl_dereference(bond->curr_active_slave) || slave == rtnl_dereference(bond->primary_slave) || slave->prio > rtnl_dereference(bond->curr_active_slave)->prio) do_failover = true; } continue; case BOND_LINK_DOWN: if (slave->link_failure_count < UINT_MAX) slave->link_failure_count++; bond_set_slave_link_state(slave, BOND_LINK_DOWN, BOND_SLAVE_NOTIFY_NOW); bond_set_slave_inactive_flags(slave, BOND_SLAVE_NOTIFY_NOW); slave_info(bond->dev, slave->dev, "link status definitely down, disabling slave\n"); if (slave == rtnl_dereference(bond->curr_active_slave)) { RCU_INIT_POINTER(bond->current_arp_slave, NULL); do_failover = true; } continue; case BOND_LINK_FAIL: bond_set_slave_link_state(slave, BOND_LINK_FAIL, BOND_SLAVE_NOTIFY_NOW); bond_set_slave_inactive_flags(slave, BOND_SLAVE_NOTIFY_NOW); /* A slave has just been enslaved and has become * the current active slave. */ if (rtnl_dereference(bond->curr_active_slave)) RCU_INIT_POINTER(bond->current_arp_slave, NULL); continue; default: slave_err(bond->dev, slave->dev, "impossible: link_new_state %d on slave\n", slave->link_new_state); continue; } } if (do_failover) { block_netpoll_tx(); bond_select_active_slave(bond); unblock_netpoll_tx(); } bond_set_carrier(bond); } /* Send ARP probes for active-backup mode ARP monitor. * * Called with rcu_read_lock held. 
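 * If there is no curr_active_slave, the probe below rotates
 * current_arp_slave through the up backup slaves so that each gets a
 * chance to exchange ARP traffic before a new active slave is selected.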
*/ static bool bond_ab_arp_probe(struct bonding *bond) { struct slave *slave, *before = NULL, *new_slave = NULL, *curr_arp_slave = rcu_dereference(bond->current_arp_slave), *curr_active_slave = rcu_dereference(bond->curr_active_slave); struct list_head *iter; bool found = false; bool should_notify_rtnl = BOND_SLAVE_NOTIFY_LATER; if (curr_arp_slave && curr_active_slave) netdev_info(bond->dev, "PROBE: c_arp %s && cas %s BAD\n", curr_arp_slave->dev->name, curr_active_slave->dev->name); if (curr_active_slave) { bond_send_validate(bond, curr_active_slave); return should_notify_rtnl; } /* if we don't have a curr_active_slave, search for the next available * backup slave from the current_arp_slave and make it the candidate * for becoming the curr_active_slave */ if (!curr_arp_slave) { curr_arp_slave = bond_first_slave_rcu(bond); if (!curr_arp_slave) return should_notify_rtnl; } bond_for_each_slave_rcu(bond, slave, iter) { if (!found && !before && bond_slave_is_up(slave)) before = slave; if (found && !new_slave && bond_slave_is_up(slave)) new_slave = slave; /* if the link state is up at this point, we * mark it down - this can happen if we have * simultaneous link failures and * reselect_active_interface doesn't make this * one the current slave so it is still marked * up when it is actually down */ if (!bond_slave_is_up(slave) && slave->link == BOND_LINK_UP) { bond_set_slave_link_state(slave, BOND_LINK_DOWN, BOND_SLAVE_NOTIFY_LATER); if (slave->link_failure_count < UINT_MAX) slave->link_failure_count++; bond_set_slave_inactive_flags(slave, BOND_SLAVE_NOTIFY_LATER); slave_info(bond->dev, slave->dev, "backup interface is now down\n"); } if (slave == curr_arp_slave) found = true; } if (!new_slave && before) new_slave = before; if (!new_slave) goto check_state; bond_set_slave_link_state(new_slave, BOND_LINK_BACK, BOND_SLAVE_NOTIFY_LATER); bond_set_slave_active_flags(new_slave, BOND_SLAVE_NOTIFY_LATER); bond_send_validate(bond, new_slave); new_slave->last_link_up = jiffies; rcu_assign_pointer(bond->current_arp_slave, new_slave); check_state: bond_for_each_slave_rcu(bond, slave, iter) { if (slave->should_notify || slave->should_notify_link) { should_notify_rtnl = BOND_SLAVE_NOTIFY_NOW; break; } } return should_notify_rtnl; } static void bond_activebackup_arp_mon(struct bonding *bond) { bool should_notify_peers = false; bool should_notify_rtnl = false; int delta_in_ticks; delta_in_ticks = msecs_to_jiffies(bond->params.arp_interval); if (!bond_has_slaves(bond)) goto re_arm; rcu_read_lock(); should_notify_peers = bond_should_notify_peers(bond); if (bond_ab_arp_inspect(bond)) { rcu_read_unlock(); /* Race avoidance with bond_close flush of workqueue */ if (!rtnl_trylock()) { delta_in_ticks = 1; should_notify_peers = false; goto re_arm; } bond_ab_arp_commit(bond); rtnl_unlock(); rcu_read_lock(); } should_notify_rtnl = bond_ab_arp_probe(bond); rcu_read_unlock(); re_arm: if (bond->params.arp_interval) queue_delayed_work(bond->wq, &bond->arp_work, delta_in_ticks); if (should_notify_peers || should_notify_rtnl) { if (!rtnl_trylock()) return; if (should_notify_peers) { bond->send_peer_notif--; call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, bond->dev); } if (should_notify_rtnl) { bond_slave_state_notify(bond); bond_slave_link_notify(bond); } rtnl_unlock(); } } static void bond_arp_monitor(struct work_struct *work) { struct bonding *bond = container_of(work, struct bonding, arp_work.work); if (BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP) bond_activebackup_arp_mon(bond); else bond_loadbalance_arp_mon(bond); } 
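/* Usage sketch (illustrative, not part of the driver): the ARP/NS monitor
 * above is enabled from user space via the bonding options, e.g. through
 * sysfs:
 *
 *	echo 1000       > /sys/class/net/bond0/bonding/arp_interval
 *	echo +192.0.2.1 > /sys/class/net/bond0/bonding/arp_ip_target
 *
 * "bond0" and 192.0.2.1 are placeholder values; the ns_ip6_target option
 * provides the IPv6 equivalent used by bond_ns_send_all() above.
 */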
/*-------------------------- netdev event handling --------------------------*/ /* Change device name */ static int bond_event_changename(struct bonding *bond) { bond_remove_proc_entry(bond); bond_create_proc_entry(bond); bond_debug_reregister(bond); return NOTIFY_DONE; } static int bond_master_netdev_event(unsigned long event, struct net_device *bond_dev) { struct bonding *event_bond = netdev_priv(bond_dev); netdev_dbg(bond_dev, "%s called\n", __func__); switch (event) { case NETDEV_CHANGENAME: return bond_event_changename(event_bond); case NETDEV_UNREGISTER: bond_remove_proc_entry(event_bond); #ifdef CONFIG_XFRM_OFFLOAD xfrm_dev_state_flush(dev_net(bond_dev), bond_dev, true); #endif /* CONFIG_XFRM_OFFLOAD */ break; case NETDEV_REGISTER: bond_create_proc_entry(event_bond); break; default: break; } return NOTIFY_DONE; } static int bond_slave_netdev_event(unsigned long event, struct net_device *slave_dev) { struct slave *slave = bond_slave_get_rtnl(slave_dev), *primary; struct bonding *bond; struct net_device *bond_dev; /* A netdev event can be generated while enslaving a device * before netdev_rx_handler_register is called in which case * slave will be NULL */ if (!slave) { netdev_dbg(slave_dev, "%s called on NULL slave\n", __func__); return NOTIFY_DONE; } bond_dev = slave->bond->dev; bond = slave->bond; primary = rtnl_dereference(bond->primary_slave); slave_dbg(bond_dev, slave_dev, "%s called\n", __func__); switch (event) { case NETDEV_UNREGISTER: if (bond_dev->type != ARPHRD_ETHER) bond_release_and_destroy(bond_dev, slave_dev); else __bond_release_one(bond_dev, slave_dev, false, true); break; case NETDEV_UP: case NETDEV_CHANGE: /* For 802.3ad mode only: * Getting invalid Speed/Duplex values here will put slave * in weird state. Mark it as link-fail if the link was * previously up or link-down if it hasn't yet come up, and * let link-monitoring (miimon) set it right when correct * speeds/duplex are available. */ if (bond_update_speed_duplex(slave) && BOND_MODE(bond) == BOND_MODE_8023AD) { if (slave->last_link_up) slave->link = BOND_LINK_FAIL; else slave->link = BOND_LINK_DOWN; } if (BOND_MODE(bond) == BOND_MODE_8023AD) bond_3ad_adapter_speed_duplex_changed(slave); fallthrough; case NETDEV_DOWN: /* Refresh slave-array if applicable! * If the setup does not use miimon or arpmon (mode-specific!), * then these events will not cause the slave-array to be * refreshed. This will cause xmit to use a slave that is not * usable. Avoid such situation by refeshing the array at these * events. If these (miimon/arpmon) parameters are configured * then array gets refreshed twice and that should be fine! */ if (bond_mode_can_use_xmit_hash(bond)) bond_update_slave_arr(bond, NULL); break; case NETDEV_CHANGEMTU: /* TODO: Should slaves be allowed to * independently alter their MTU? For * an active-backup bond, slaves need * not be the same type of device, so * MTUs may vary. For other modes, * slaves arguably should have the * same MTUs. To do this, we'd need to * take over the slave's change_mtu * function for the duration of their * servitude. 
*/ break; case NETDEV_CHANGENAME: /* we don't care if we don't have primary set */ if (!bond_uses_primary(bond) || !bond->params.primary[0]) break; if (slave == primary) { /* slave's name changed - he's no longer primary */ RCU_INIT_POINTER(bond->primary_slave, NULL); } else if (!strcmp(slave_dev->name, bond->params.primary)) { /* we have a new primary slave */ rcu_assign_pointer(bond->primary_slave, slave); } else { /* we didn't change primary - exit */ break; } netdev_info(bond->dev, "Primary slave changed to %s, reselecting active slave\n", primary ? slave_dev->name : "none"); block_netpoll_tx(); bond_select_active_slave(bond); unblock_netpoll_tx(); break; case NETDEV_FEAT_CHANGE: if (!bond->notifier_ctx) { bond->notifier_ctx = true; bond_compute_features(bond); bond->notifier_ctx = false; } break; case NETDEV_RESEND_IGMP: /* Propagate to master device */ call_netdevice_notifiers(event, slave->bond->dev); break; case NETDEV_XDP_FEAT_CHANGE: bond_xdp_set_features(bond_dev); break; default: break; } return NOTIFY_DONE; } /* bond_netdev_event: handle netdev notifier chain events. * * This function receives events for the netdev chain. The caller (an * ioctl handler calling blocking_notifier_call_chain) holds the necessary * locks for us to safely manipulate the slave devices (RTNL lock, * dev_probe_lock). */ static int bond_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *event_dev = netdev_notifier_info_to_dev(ptr); netdev_dbg(event_dev, "%s received %s\n", __func__, netdev_cmd_to_name(event)); if (!(event_dev->priv_flags & IFF_BONDING)) return NOTIFY_DONE; if (event_dev->flags & IFF_MASTER) { int ret; ret = bond_master_netdev_event(event, event_dev); if (ret != NOTIFY_DONE) return ret; } if (event_dev->flags & IFF_SLAVE) return bond_slave_netdev_event(event, event_dev); return NOTIFY_DONE; } static struct notifier_block bond_netdev_notifier = { .notifier_call = bond_netdev_event, }; /*---------------------------- Hashing Policies -----------------------------*/ /* Helper to access data in a packet, with or without a backing skb. * If skb is given the data is linearized if necessary via pskb_may_pull. 
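 * Returns NULL when the first n bytes cannot be made available; the hash
 * helpers below treat that as "no usable header" and either hash to 0 or
 * fall back to the plain L2 hash.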
*/ static inline const void *bond_pull_data(struct sk_buff *skb, const void *data, int hlen, int n) { if (likely(n <= hlen)) return data; else if (skb && likely(pskb_may_pull(skb, n))) return skb->data; return NULL; } /* L2 hash helper */ static inline u32 bond_eth_hash(struct sk_buff *skb, const void *data, int mhoff, int hlen) { struct ethhdr *ep; data = bond_pull_data(skb, data, hlen, mhoff + sizeof(struct ethhdr)); if (!data) return 0; ep = (struct ethhdr *)(data + mhoff); return ep->h_dest[5] ^ ep->h_source[5] ^ be16_to_cpu(ep->h_proto); } static bool bond_flow_ip(struct sk_buff *skb, struct flow_keys *fk, const void *data, int hlen, __be16 l2_proto, int *nhoff, int *ip_proto, bool l34) { const struct ipv6hdr *iph6; const struct iphdr *iph; if (l2_proto == htons(ETH_P_IP)) { data = bond_pull_data(skb, data, hlen, *nhoff + sizeof(*iph)); if (!data) return false; iph = (const struct iphdr *)(data + *nhoff); iph_to_flow_copy_v4addrs(fk, iph); *nhoff += iph->ihl << 2; if (!ip_is_fragment(iph)) *ip_proto = iph->protocol; } else if (l2_proto == htons(ETH_P_IPV6)) { data = bond_pull_data(skb, data, hlen, *nhoff + sizeof(*iph6)); if (!data) return false; iph6 = (const struct ipv6hdr *)(data + *nhoff); iph_to_flow_copy_v6addrs(fk, iph6); *nhoff += sizeof(*iph6); *ip_proto = iph6->nexthdr; } else { return false; } if (l34 && *ip_proto >= 0) fk->ports.ports = skb_flow_get_ports(skb, *nhoff, *ip_proto, data, hlen); return true; } static u32 bond_vlan_srcmac_hash(struct sk_buff *skb, const void *data, int mhoff, int hlen) { u32 srcmac_vendor = 0, srcmac_dev = 0; struct ethhdr *mac_hdr; u16 vlan = 0; int i; data = bond_pull_data(skb, data, hlen, mhoff + sizeof(struct ethhdr)); if (!data) return 0; mac_hdr = (struct ethhdr *)(data + mhoff); for (i = 0; i < 3; i++) srcmac_vendor = (srcmac_vendor << 8) | mac_hdr->h_source[i]; for (i = 3; i < ETH_ALEN; i++) srcmac_dev = (srcmac_dev << 8) | mac_hdr->h_source[i]; if (skb && skb_vlan_tag_present(skb)) vlan = skb_vlan_tag_get(skb); return vlan ^ srcmac_vendor ^ srcmac_dev; } /* Extract the appropriate headers based on bond's xmit policy */ static bool bond_flow_dissect(struct bonding *bond, struct sk_buff *skb, const void *data, __be16 l2_proto, int nhoff, int hlen, struct flow_keys *fk) { bool l34 = bond->params.xmit_policy == BOND_XMIT_POLICY_LAYER34; int ip_proto = -1; switch (bond->params.xmit_policy) { case BOND_XMIT_POLICY_ENCAP23: case BOND_XMIT_POLICY_ENCAP34: memset(fk, 0, sizeof(*fk)); return __skb_flow_dissect(NULL, skb, &flow_keys_bonding, fk, data, l2_proto, nhoff, hlen, 0); default: break; } fk->ports.ports = 0; memset(&fk->icmp, 0, sizeof(fk->icmp)); if (!bond_flow_ip(skb, fk, data, hlen, l2_proto, &nhoff, &ip_proto, l34)) return false; /* ICMP error packets contains at least 8 bytes of the header * of the packet which generated the error. Use this information * to correlate ICMP error packets within the same flow which * generated the error. 
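 * For example (illustrative): an ICMP "port unreachable" generated for a
 * UDP probe carries the probe's IP header plus its first 8 bytes, which
 * include the UDP ports, so dissecting past the ICMP header recovers the
 * original flow's addresses/ports and the error hashes with the flow that
 * triggered it.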
*/ if (ip_proto == IPPROTO_ICMP || ip_proto == IPPROTO_ICMPV6) { skb_flow_get_icmp_tci(skb, &fk->icmp, data, nhoff, hlen); if (ip_proto == IPPROTO_ICMP) { if (!icmp_is_err(fk->icmp.type)) return true; nhoff += sizeof(struct icmphdr); } else if (ip_proto == IPPROTO_ICMPV6) { if (!icmpv6_is_err(fk->icmp.type)) return true; nhoff += sizeof(struct icmp6hdr); } return bond_flow_ip(skb, fk, data, hlen, l2_proto, &nhoff, &ip_proto, l34); } return true; } static u32 bond_ip_hash(u32 hash, struct flow_keys *flow, int xmit_policy) { hash ^= (__force u32)flow_get_u32_dst(flow) ^ (__force u32)flow_get_u32_src(flow); hash ^= (hash >> 16); hash ^= (hash >> 8); /* discard lowest hash bit to deal with the common even ports pattern */ if (xmit_policy == BOND_XMIT_POLICY_LAYER34 || xmit_policy == BOND_XMIT_POLICY_ENCAP34) return hash >> 1; return hash; } /* Generate hash based on xmit policy. If @skb is given it is used to linearize * the data as required, but this function can be used without it if the data is * known to be linear (e.g. with xdp_buff). */ static u32 __bond_xmit_hash(struct bonding *bond, struct sk_buff *skb, const void *data, __be16 l2_proto, int mhoff, int nhoff, int hlen) { struct flow_keys flow; u32 hash; if (bond->params.xmit_policy == BOND_XMIT_POLICY_VLAN_SRCMAC) return bond_vlan_srcmac_hash(skb, data, mhoff, hlen); if (bond->params.xmit_policy == BOND_XMIT_POLICY_LAYER2 || !bond_flow_dissect(bond, skb, data, l2_proto, nhoff, hlen, &flow)) return bond_eth_hash(skb, data, mhoff, hlen); if (bond->params.xmit_policy == BOND_XMIT_POLICY_LAYER23 || bond->params.xmit_policy == BOND_XMIT_POLICY_ENCAP23) { hash = bond_eth_hash(skb, data, mhoff, hlen); } else { if (flow.icmp.id) memcpy(&hash, &flow.icmp, sizeof(hash)); else memcpy(&hash, &flow.ports.ports, sizeof(hash)); } return bond_ip_hash(hash, &flow, bond->params.xmit_policy); } /** * bond_xmit_hash - generate a hash value based on the xmit policy * @bond: bonding device * @skb: buffer to use for headers * * This function will extract the necessary headers from the skb buffer and use * them to generate a hash based on the xmit_policy set in the bonding device */ u32 bond_xmit_hash(struct bonding *bond, struct sk_buff *skb) { if (bond->params.xmit_policy == BOND_XMIT_POLICY_ENCAP34 && skb->l4_hash) return skb->hash; return __bond_xmit_hash(bond, skb, skb->data, skb->protocol, 0, skb_network_offset(skb), skb_headlen(skb)); } /** * bond_xmit_hash_xdp - generate a hash value based on the xmit policy * @bond: bonding device * @xdp: buffer to use for headers * * The XDP variant of bond_xmit_hash. 
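 * Return: the computed hash, or 0 when the Ethernet header does not fit
 * inside the xdp_buff (no skb is available here to pull more data).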
*/ static u32 bond_xmit_hash_xdp(struct bonding *bond, struct xdp_buff *xdp) { struct ethhdr *eth; if (xdp->data + sizeof(struct ethhdr) > xdp->data_end) return 0; eth = (struct ethhdr *)xdp->data; return __bond_xmit_hash(bond, NULL, xdp->data, eth->h_proto, 0, sizeof(struct ethhdr), xdp->data_end - xdp->data); } /*-------------------------- Device entry points ----------------------------*/ void bond_work_init_all(struct bonding *bond) { INIT_DELAYED_WORK(&bond->mcast_work, bond_resend_igmp_join_requests_delayed); INIT_DELAYED_WORK(&bond->alb_work, bond_alb_monitor); INIT_DELAYED_WORK(&bond->mii_work, bond_mii_monitor); INIT_DELAYED_WORK(&bond->arp_work, bond_arp_monitor); INIT_DELAYED_WORK(&bond->ad_work, bond_3ad_state_machine_handler); INIT_DELAYED_WORK(&bond->slave_arr_work, bond_slave_arr_handler); } void bond_work_cancel_all(struct bonding *bond) { cancel_delayed_work_sync(&bond->mii_work); cancel_delayed_work_sync(&bond->arp_work); cancel_delayed_work_sync(&bond->alb_work); cancel_delayed_work_sync(&bond->ad_work); cancel_delayed_work_sync(&bond->mcast_work); cancel_delayed_work_sync(&bond->slave_arr_work); } static int bond_open(struct net_device *bond_dev) { struct bonding *bond = netdev_priv(bond_dev); struct list_head *iter; struct slave *slave; if (BOND_MODE(bond) == BOND_MODE_ROUNDROBIN && !bond->rr_tx_counter) { bond->rr_tx_counter = alloc_percpu(u32); if (!bond->rr_tx_counter) return -ENOMEM; } /* reset slave->backup and slave->inactive */ if (bond_has_slaves(bond)) { bond_for_each_slave(bond, slave, iter) { if (bond_uses_primary(bond) && slave != rcu_access_pointer(bond->curr_active_slave)) { bond_set_slave_inactive_flags(slave, BOND_SLAVE_NOTIFY_NOW); } else if (BOND_MODE(bond) != BOND_MODE_8023AD) { bond_set_slave_active_flags(slave, BOND_SLAVE_NOTIFY_NOW); } } } if (bond_is_lb(bond)) { /* bond_alb_initialize must be called before the timer * is started. */ if (bond_alb_initialize(bond, (BOND_MODE(bond) == BOND_MODE_ALB))) return -ENOMEM; if (bond->params.tlb_dynamic_lb || BOND_MODE(bond) == BOND_MODE_ALB) queue_delayed_work(bond->wq, &bond->alb_work, 0); } if (bond->params.miimon) /* link check interval, in milliseconds. */ queue_delayed_work(bond->wq, &bond->mii_work, 0); if (bond->params.arp_interval) { /* arp interval, in milliseconds. 
*/ queue_delayed_work(bond->wq, &bond->arp_work, 0); bond->recv_probe = bond_rcv_validate; } if (BOND_MODE(bond) == BOND_MODE_8023AD) { queue_delayed_work(bond->wq, &bond->ad_work, 0); /* register to receive LACPDUs */ bond->recv_probe = bond_3ad_lacpdu_recv; bond_3ad_initiate_agg_selection(bond, 1); bond_for_each_slave(bond, slave, iter) dev_mc_add(slave->dev, lacpdu_mcast_addr); if (bond->params.broadcast_neighbor) static_branch_inc(&bond_bcast_neigh_enabled); } if (bond_mode_can_use_xmit_hash(bond)) bond_update_slave_arr(bond, NULL); return 0; } static int bond_close(struct net_device *bond_dev) { struct bonding *bond = netdev_priv(bond_dev); struct slave *slave; bond_work_cancel_all(bond); bond->send_peer_notif = 0; if (bond_is_lb(bond)) bond_alb_deinitialize(bond); bond->recv_probe = NULL; if (BOND_MODE(bond) == BOND_MODE_8023AD && bond->params.broadcast_neighbor) static_branch_dec(&bond_bcast_neigh_enabled); if (bond_uses_primary(bond)) { rcu_read_lock(); slave = rcu_dereference(bond->curr_active_slave); if (slave) bond_hw_addr_flush(bond_dev, slave->dev); rcu_read_unlock(); } else { struct list_head *iter; bond_for_each_slave(bond, slave, iter) bond_hw_addr_flush(bond_dev, slave->dev); } return 0; } /* fold stats, assuming all rtnl_link_stats64 fields are u64, but * that some drivers can provide 32bit values only. */ static void bond_fold_stats(struct rtnl_link_stats64 *_res, const struct rtnl_link_stats64 *_new, const struct rtnl_link_stats64 *_old) { const u64 *new = (const u64 *)_new; const u64 *old = (const u64 *)_old; u64 *res = (u64 *)_res; int i; for (i = 0; i < sizeof(*_res) / sizeof(u64); i++) { u64 nv = new[i]; u64 ov = old[i]; s64 delta = nv - ov; /* detects if this particular field is 32bit only */ if (((nv | ov) >> 32) == 0) delta = (s64)(s32)((u32)nv - (u32)ov); /* filter anomalies, some drivers reset their stats * at down/up events. 
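 * Worked example (illustrative): a 32-bit counter wrapping from 0xfffffff0
 * to 0x10 leaves the top 32 bits of (nv | ov) clear, so the delta is
 * computed modulo 2^32 as (s32)(0x10 - 0xfffffff0) = 0x20 instead of a
 * huge negative value; genuine resets to zero still yield delta <= 0 and
 * are dropped by the check below.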
*/ if (delta > 0) res[i] += delta; } } #ifdef CONFIG_LOCKDEP static int bond_get_lowest_level_rcu(struct net_device *dev) { struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1]; struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1]; int cur = 0, max = 0; now = dev; iter = &dev->adj_list.lower; while (1) { next = NULL; while (1) { ldev = netdev_next_lower_dev_rcu(now, &iter); if (!ldev) break; next = ldev; niter = &ldev->adj_list.lower; dev_stack[cur] = now; iter_stack[cur++] = iter; if (max <= cur) max = cur; break; } if (!next) { if (!cur) return max; next = dev_stack[--cur]; niter = iter_stack[cur]; } now = next; iter = niter; } return max; } #endif static void bond_get_stats(struct net_device *bond_dev, struct rtnl_link_stats64 *stats) { struct bonding *bond = netdev_priv(bond_dev); struct rtnl_link_stats64 temp; struct list_head *iter; struct slave *slave; int nest_level = 0; rcu_read_lock(); #ifdef CONFIG_LOCKDEP nest_level = bond_get_lowest_level_rcu(bond_dev); #endif spin_lock_nested(&bond->stats_lock, nest_level); memcpy(stats, &bond->bond_stats, sizeof(*stats)); bond_for_each_slave_rcu(bond, slave, iter) { const struct rtnl_link_stats64 *new = dev_get_stats(slave->dev, &temp); bond_fold_stats(stats, new, &slave->slave_stats); /* save off the slave stats for the next run */ memcpy(&slave->slave_stats, new, sizeof(*new)); } memcpy(&bond->bond_stats, stats, sizeof(*stats)); spin_unlock(&bond->stats_lock); rcu_read_unlock(); } static int bond_eth_ioctl(struct net_device *bond_dev, struct ifreq *ifr, int cmd) { struct bonding *bond = netdev_priv(bond_dev); struct mii_ioctl_data *mii = NULL; netdev_dbg(bond_dev, "bond_eth_ioctl: cmd=%d\n", cmd); switch (cmd) { case SIOCGMIIPHY: mii = if_mii(ifr); if (!mii) return -EINVAL; mii->phy_id = 0; fallthrough; case SIOCGMIIREG: /* We do this again just in case we were called by SIOCGMIIREG * instead of SIOCGMIIPHY. 
*/ mii = if_mii(ifr); if (!mii) return -EINVAL; if (mii->reg_num == 1) { mii->val_out = 0; if (netif_carrier_ok(bond->dev)) mii->val_out = BMSR_LSTATUS; } break; default: return -EOPNOTSUPP; } return 0; } static int bond_do_ioctl(struct net_device *bond_dev, struct ifreq *ifr, int cmd) { struct bonding *bond = netdev_priv(bond_dev); struct net_device *slave_dev = NULL; struct ifbond k_binfo; struct ifbond __user *u_binfo = NULL; struct ifslave k_sinfo; struct ifslave __user *u_sinfo = NULL; struct bond_opt_value newval; struct net *net; int res = 0; netdev_dbg(bond_dev, "bond_ioctl: cmd=%d\n", cmd); switch (cmd) { case SIOCBONDINFOQUERY: u_binfo = (struct ifbond __user *)ifr->ifr_data; if (copy_from_user(&k_binfo, u_binfo, sizeof(ifbond))) return -EFAULT; bond_info_query(bond_dev, &k_binfo); if (copy_to_user(u_binfo, &k_binfo, sizeof(ifbond))) return -EFAULT; return 0; case SIOCBONDSLAVEINFOQUERY: u_sinfo = (struct ifslave __user *)ifr->ifr_data; if (copy_from_user(&k_sinfo, u_sinfo, sizeof(ifslave))) return -EFAULT; res = bond_slave_info_query(bond_dev, &k_sinfo); if (res == 0 && copy_to_user(u_sinfo, &k_sinfo, sizeof(ifslave))) return -EFAULT; return res; default: break; } net = dev_net(bond_dev); if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; slave_dev = __dev_get_by_name(net, ifr->ifr_slave); slave_dbg(bond_dev, slave_dev, "slave_dev=%p:\n", slave_dev); if (!slave_dev) return -ENODEV; switch (cmd) { case SIOCBONDENSLAVE: res = bond_enslave(bond_dev, slave_dev, NULL); break; case SIOCBONDRELEASE: res = bond_release(bond_dev, slave_dev); break; case SIOCBONDSETHWADDR: res = bond_set_dev_addr(bond_dev, slave_dev); break; case SIOCBONDCHANGEACTIVE: bond_opt_initstr(&newval, slave_dev->name); res = __bond_opt_set_notify(bond, BOND_OPT_ACTIVE_SLAVE, &newval); break; default: res = -EOPNOTSUPP; } return res; } static int bond_siocdevprivate(struct net_device *bond_dev, struct ifreq *ifr, void __user *data, int cmd) { struct ifreq ifrdata = { .ifr_data = data }; switch (cmd) { case BOND_INFO_QUERY_OLD: return bond_do_ioctl(bond_dev, &ifrdata, SIOCBONDINFOQUERY); case BOND_SLAVE_INFO_QUERY_OLD: return bond_do_ioctl(bond_dev, &ifrdata, SIOCBONDSLAVEINFOQUERY); case BOND_ENSLAVE_OLD: return bond_do_ioctl(bond_dev, ifr, SIOCBONDENSLAVE); case BOND_RELEASE_OLD: return bond_do_ioctl(bond_dev, ifr, SIOCBONDRELEASE); case BOND_SETHWADDR_OLD: return bond_do_ioctl(bond_dev, ifr, SIOCBONDSETHWADDR); case BOND_CHANGE_ACTIVE_OLD: return bond_do_ioctl(bond_dev, ifr, SIOCBONDCHANGEACTIVE); } return -EOPNOTSUPP; } static void bond_change_rx_flags(struct net_device *bond_dev, int change) { struct bonding *bond = netdev_priv(bond_dev); if (change & IFF_PROMISC) bond_set_promiscuity(bond, bond_dev->flags & IFF_PROMISC ? 1 : -1); if (change & IFF_ALLMULTI) bond_set_allmulti(bond, bond_dev->flags & IFF_ALLMULTI ? 
1 : -1); } static void bond_set_rx_mode(struct net_device *bond_dev) { struct bonding *bond = netdev_priv(bond_dev); struct list_head *iter; struct slave *slave; rcu_read_lock(); if (bond_uses_primary(bond)) { slave = rcu_dereference(bond->curr_active_slave); if (slave) { dev_uc_sync(slave->dev, bond_dev); dev_mc_sync(slave->dev, bond_dev); } } else { bond_for_each_slave_rcu(bond, slave, iter) { dev_uc_sync_multiple(slave->dev, bond_dev); dev_mc_sync_multiple(slave->dev, bond_dev); } } rcu_read_unlock(); } static int bond_neigh_init(struct neighbour *n) { struct bonding *bond = netdev_priv(n->dev); const struct net_device_ops *slave_ops; struct neigh_parms parms; struct slave *slave; int ret = 0; rcu_read_lock(); slave = bond_first_slave_rcu(bond); if (!slave) goto out; slave_ops = slave->dev->netdev_ops; if (!slave_ops->ndo_neigh_setup) goto out; /* TODO: find another way [1] to implement this. * Passing a zeroed structure is fragile, * but at least we do not pass garbage. * * [1] One way would be that ndo_neigh_setup() never touch * struct neigh_parms, but propagate the new neigh_setup() * back to ___neigh_create() / neigh_parms_alloc() */ memset(&parms, 0, sizeof(parms)); ret = slave_ops->ndo_neigh_setup(slave->dev, &parms); if (ret) goto out; if (parms.neigh_setup) ret = parms.neigh_setup(n); out: rcu_read_unlock(); return ret; } /* The bonding ndo_neigh_setup is called at init time beofre any * slave exists. So we must declare proxy setup function which will * be used at run time to resolve the actual slave neigh param setup. * * It's also called by master devices (such as vlans) to setup their * underlying devices. In that case - do nothing, we're already set up from * our init. */ static int bond_neigh_setup(struct net_device *dev, struct neigh_parms *parms) { /* modify only our neigh_parms */ if (parms->dev == dev) parms->neigh_setup = bond_neigh_init; return 0; } /* Change the MTU of all of a master's slaves to match the master */ static int bond_change_mtu(struct net_device *bond_dev, int new_mtu) { struct bonding *bond = netdev_priv(bond_dev); struct slave *slave, *rollback_slave; struct list_head *iter; int res = 0; netdev_dbg(bond_dev, "bond=%p, new_mtu=%d\n", bond, new_mtu); bond_for_each_slave(bond, slave, iter) { slave_dbg(bond_dev, slave->dev, "s %p c_m %p\n", slave, slave->dev->netdev_ops->ndo_change_mtu); res = dev_set_mtu(slave->dev, new_mtu); if (res) { /* If we failed to set the slave's mtu to the new value * we must abort the operation even in ACTIVE_BACKUP * mode, because if we allow the backup slaves to have * different mtu values than the active slave we'll * need to change their mtu when doing a failover. That * means changing their mtu from timer context, which * is probably not a good idea. */ slave_dbg(bond_dev, slave->dev, "err %d setting mtu to %d\n", res, new_mtu); goto unwind; } } WRITE_ONCE(bond_dev->mtu, new_mtu); return 0; unwind: /* unwind from head to the slave that failed */ bond_for_each_slave(bond, rollback_slave, iter) { int tmp_res; if (rollback_slave == slave) break; tmp_res = dev_set_mtu(rollback_slave->dev, bond_dev->mtu); if (tmp_res) slave_dbg(bond_dev, rollback_slave->dev, "unwind err %d\n", tmp_res); } return res; } /* Change HW address * * Note that many devices must be down to change the HW address, and * downing the master releases all slaves. We can make bonds full of * bonding devices to test this, however. 
*/ static int bond_set_mac_address(struct net_device *bond_dev, void *addr) { struct bonding *bond = netdev_priv(bond_dev); struct slave *slave, *rollback_slave; struct sockaddr_storage *ss = addr, tmp_ss; struct list_head *iter; int res = 0; if (BOND_MODE(bond) == BOND_MODE_ALB) return bond_alb_set_mac_address(bond_dev, addr); netdev_dbg(bond_dev, "%s: bond=%p\n", __func__, bond); /* If fail_over_mac is enabled, do nothing and return success. * Returning an error causes ifenslave to fail. */ if (bond->params.fail_over_mac && BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP) return 0; if (!is_valid_ether_addr(ss->__data)) return -EADDRNOTAVAIL; bond_for_each_slave(bond, slave, iter) { slave_dbg(bond_dev, slave->dev, "%s: slave=%p\n", __func__, slave); res = dev_set_mac_address(slave->dev, addr, NULL); if (res) { /* TODO: consider downing the slave * and retry ? * User should expect communications * breakage anyway until ARP finish * updating, so... */ slave_dbg(bond_dev, slave->dev, "%s: err %d\n", __func__, res); goto unwind; } } /* success */ dev_addr_set(bond_dev, ss->__data); return 0; unwind: memcpy(tmp_ss.__data, bond_dev->dev_addr, bond_dev->addr_len); tmp_ss.ss_family = bond_dev->type; /* unwind from head to the slave that failed */ bond_for_each_slave(bond, rollback_slave, iter) { int tmp_res; if (rollback_slave == slave) break; tmp_res = dev_set_mac_address(rollback_slave->dev, &tmp_ss, NULL); if (tmp_res) { slave_dbg(bond_dev, rollback_slave->dev, "%s: unwind err %d\n", __func__, tmp_res); } } return res; } /** * bond_get_slave_by_id - get xmit slave with slave_id * @bond: bonding device that is transmitting * @slave_id: slave id up to slave_cnt-1 through which to transmit * * This function tries to get slave with slave_id but in case * it fails, it tries to find the first available slave for transmission. */ static struct slave *bond_get_slave_by_id(struct bonding *bond, int slave_id) { struct list_head *iter; struct slave *slave; int i = slave_id; /* Here we start from the slave with slave_id */ bond_for_each_slave_rcu(bond, slave, iter) { if (--i < 0) { if (bond_slave_can_tx(slave)) return slave; } } /* Here we start from the first slave up to slave_id */ i = slave_id; bond_for_each_slave_rcu(bond, slave, iter) { if (--i < 0) break; if (bond_slave_can_tx(slave)) return slave; } /* no slave that can tx has been found */ return NULL; } /** * bond_rr_gen_slave_id - generate slave id based on packets_per_slave * @bond: bonding device to use * * Based on the value of the bonding device's packets_per_slave parameter * this function generates a slave id, which is usually used as the next * slave to transmit through. */ static u32 bond_rr_gen_slave_id(struct bonding *bond) { u32 slave_id; struct reciprocal_value reciprocal_packets_per_slave; int packets_per_slave = bond->params.packets_per_slave; switch (packets_per_slave) { case 0: slave_id = get_random_u32(); break; case 1: slave_id = this_cpu_inc_return(*bond->rr_tx_counter); break; default: reciprocal_packets_per_slave = bond->params.reciprocal_packets_per_slave; slave_id = this_cpu_inc_return(*bond->rr_tx_counter); slave_id = reciprocal_divide(slave_id, reciprocal_packets_per_slave); break; } return slave_id; } static struct slave *bond_xmit_roundrobin_slave_get(struct bonding *bond, struct sk_buff *skb) { struct slave *slave; int slave_cnt; u32 slave_id; /* Start with the curr_active_slave that joined the bond as the * default for sending IGMP traffic. 
For failover purposes one * needs to maintain some consistency for the interface that will * send the join/membership reports. The curr_active_slave found * will send all of this type of traffic. */ if (skb->protocol == htons(ETH_P_IP)) { int noff = skb_network_offset(skb); struct iphdr *iph; if (unlikely(!pskb_may_pull(skb, noff + sizeof(*iph)))) goto non_igmp; iph = ip_hdr(skb); if (iph->protocol == IPPROTO_IGMP) { slave = rcu_dereference(bond->curr_active_slave); if (slave) return slave; return bond_get_slave_by_id(bond, 0); } } non_igmp: slave_cnt = READ_ONCE(bond->slave_cnt); if (likely(slave_cnt)) { slave_id = bond_rr_gen_slave_id(bond) % slave_cnt; return bond_get_slave_by_id(bond, slave_id); } return NULL; } static struct slave *bond_xdp_xmit_roundrobin_slave_get(struct bonding *bond, struct xdp_buff *xdp) { struct slave *slave; int slave_cnt; u32 slave_id; const struct ethhdr *eth; void *data = xdp->data; if (data + sizeof(struct ethhdr) > xdp->data_end) goto non_igmp; eth = (struct ethhdr *)data; data += sizeof(struct ethhdr); /* See comment on IGMP in bond_xmit_roundrobin_slave_get() */ if (eth->h_proto == htons(ETH_P_IP)) { const struct iphdr *iph; if (data + sizeof(struct iphdr) > xdp->data_end) goto non_igmp; iph = (struct iphdr *)data; if (iph->protocol == IPPROTO_IGMP) { slave = rcu_dereference(bond->curr_active_slave); if (slave) return slave; return bond_get_slave_by_id(bond, 0); } } non_igmp: slave_cnt = READ_ONCE(bond->slave_cnt); if (likely(slave_cnt)) { slave_id = bond_rr_gen_slave_id(bond) % slave_cnt; return bond_get_slave_by_id(bond, slave_id); } return NULL; } static netdev_tx_t bond_xmit_roundrobin(struct sk_buff *skb, struct net_device *bond_dev) { struct bonding *bond = netdev_priv(bond_dev); struct slave *slave; slave = bond_xmit_roundrobin_slave_get(bond, skb); if (likely(slave)) return bond_dev_queue_xmit(bond, skb, slave->dev); return bond_tx_drop(bond_dev, skb); } static struct slave *bond_xmit_activebackup_slave_get(struct bonding *bond) { return rcu_dereference(bond->curr_active_slave); } /* In active-backup mode, we know that bond->curr_active_slave is always valid if * the bond has a usable interface. */ static netdev_tx_t bond_xmit_activebackup(struct sk_buff *skb, struct net_device *bond_dev) { struct bonding *bond = netdev_priv(bond_dev); struct slave *slave; slave = bond_xmit_activebackup_slave_get(bond); if (slave) return bond_dev_queue_xmit(bond, skb, slave->dev); return bond_tx_drop(bond_dev, skb); } /* Use this to update slave_array when (a) it's not appropriate to update * slave_array right away (note that update_slave_array() may sleep) * and / or (b) RTNL is not held. */ void bond_slave_arr_work_rearm(struct bonding *bond, unsigned long delay) { queue_delayed_work(bond->wq, &bond->slave_arr_work, delay); } /* Slave array work handler. Holds only RTNL */ static void bond_slave_arr_handler(struct work_struct *work) { struct bonding *bond = container_of(work, struct bonding, slave_arr_work.work); int ret; if (!rtnl_trylock()) goto err; ret = bond_update_slave_arr(bond, NULL); rtnl_unlock(); if (ret) { pr_warn_ratelimited("Failed to update slave array from WT\n"); goto err; } return; err: bond_slave_arr_work_rearm(bond, 1); } static void bond_skip_slave(struct bond_up_slave *slaves, struct slave *skipslave) { int idx; /* Rare situation where caller has asked to skip a specific * slave but allocation failed (most likely!). BTW this is * only possible when the call is initiated from * __bond_release_one(). 
In this situation, overwrite the * skipslave entry in the array with the last entry from the * array to avoid a situation where the xmit path may choose * this to-be-skipped slave to send a packet out. */ for (idx = 0; slaves && idx < slaves->count; idx++) { if (skipslave == slaves->arr[idx]) { slaves->arr[idx] = slaves->arr[slaves->count - 1]; slaves->count--; break; } } } static void bond_set_slave_arr(struct bonding *bond, struct bond_up_slave *usable_slaves, struct bond_up_slave *all_slaves) { struct bond_up_slave *usable, *all; usable = rtnl_dereference(bond->usable_slaves); rcu_assign_pointer(bond->usable_slaves, usable_slaves); kfree_rcu(usable, rcu); all = rtnl_dereference(bond->all_slaves); rcu_assign_pointer(bond->all_slaves, all_slaves); kfree_rcu(all, rcu); } static void bond_reset_slave_arr(struct bonding *bond) { bond_set_slave_arr(bond, NULL, NULL); } /* Build the usable slaves array in control path for modes that use xmit-hash * to determine the slave interface - * (a) BOND_MODE_8023AD * (b) BOND_MODE_XOR * (c) (BOND_MODE_TLB || BOND_MODE_ALB) && tlb_dynamic_lb == 0 * * The caller is expected to hold RTNL only and NO other lock! */ int bond_update_slave_arr(struct bonding *bond, struct slave *skipslave) { struct bond_up_slave *usable_slaves = NULL, *all_slaves = NULL; struct slave *slave; struct list_head *iter; int agg_id = 0; int ret = 0; might_sleep(); usable_slaves = kzalloc(struct_size(usable_slaves, arr, bond->slave_cnt), GFP_KERNEL); all_slaves = kzalloc(struct_size(all_slaves, arr, bond->slave_cnt), GFP_KERNEL); if (!usable_slaves || !all_slaves) { ret = -ENOMEM; goto out; } if (BOND_MODE(bond) == BOND_MODE_8023AD) { struct ad_info ad_info; spin_lock_bh(&bond->mode_lock); if (bond_3ad_get_active_agg_info(bond, &ad_info)) { spin_unlock_bh(&bond->mode_lock); pr_debug("bond_3ad_get_active_agg_info failed\n"); /* No active aggregator means it's not safe to use * the previous array. */ bond_reset_slave_arr(bond); goto out; } spin_unlock_bh(&bond->mode_lock); agg_id = ad_info.aggregator_id; } bond_for_each_slave(bond, slave, iter) { if (skipslave == slave) continue; all_slaves->arr[all_slaves->count++] = slave; if (BOND_MODE(bond) == BOND_MODE_8023AD) { struct aggregator *agg; agg = SLAVE_AD_INFO(slave)->port.aggregator; if (!agg || agg->aggregator_identifier != agg_id) continue; } if (!bond_slave_can_tx(slave)) continue; slave_dbg(bond->dev, slave->dev, "Adding slave to tx hash array[%d]\n", usable_slaves->count); usable_slaves->arr[usable_slaves->count++] = slave; } bond_set_slave_arr(bond, usable_slaves, all_slaves); return ret; out: if (ret != 0 && skipslave) { bond_skip_slave(rtnl_dereference(bond->all_slaves), skipslave); bond_skip_slave(rtnl_dereference(bond->usable_slaves), skipslave); } kfree_rcu(all_slaves, rcu); kfree_rcu(usable_slaves, rcu); return ret; } static struct slave *bond_xmit_3ad_xor_slave_get(struct bonding *bond, struct sk_buff *skb, struct bond_up_slave *slaves) { struct slave *slave; unsigned int count; u32 hash; hash = bond_xmit_hash(bond, skb); count = slaves ? READ_ONCE(slaves->count) : 0; if (unlikely(!count)) return NULL; slave = slaves->arr[hash % count]; return slave; } static struct slave *bond_xdp_xmit_3ad_xor_slave_get(struct bonding *bond, struct xdp_buff *xdp) { struct bond_up_slave *slaves; unsigned int count; u32 hash; hash = bond_xmit_hash_xdp(bond, xdp); slaves = rcu_dereference(bond->usable_slaves); count = slaves ?
READ_ONCE(slaves->count) : 0; if (unlikely(!count)) return NULL; return slaves->arr[hash % count]; } static bool bond_should_broadcast_neighbor(struct sk_buff *skb, struct net_device *dev) { struct bonding *bond = netdev_priv(dev); struct { struct ipv6hdr ip6; struct icmp6hdr icmp6; } *combined, _combined; if (!static_branch_unlikely(&bond_bcast_neigh_enabled)) return false; if (!bond->params.broadcast_neighbor) return false; if (skb->protocol == htons(ETH_P_ARP)) return true; if (skb->protocol == htons(ETH_P_IPV6)) { combined = skb_header_pointer(skb, skb_mac_header_len(skb), sizeof(_combined), &_combined); if (combined && combined->ip6.nexthdr == NEXTHDR_ICMP && (combined->icmp6.icmp6_type == NDISC_NEIGHBOUR_SOLICITATION || combined->icmp6.icmp6_type == NDISC_NEIGHBOUR_ADVERTISEMENT)) return true; } return false; } /* Use this Xmit function for 3AD as well as XOR modes. The current * usable slave array is formed in the control path. The xmit function * just calculates hash and sends the packet out. */ static netdev_tx_t bond_3ad_xor_xmit(struct sk_buff *skb, struct net_device *dev) { struct bonding *bond = netdev_priv(dev); struct bond_up_slave *slaves; struct slave *slave; slaves = rcu_dereference(bond->usable_slaves); slave = bond_xmit_3ad_xor_slave_get(bond, skb, slaves); if (likely(slave)) return bond_dev_queue_xmit(bond, skb, slave->dev); return bond_tx_drop(dev, skb); } /* in broadcast mode, we send everything to all or usable slave interfaces. * under rcu_read_lock when this function is called. */ static netdev_tx_t bond_xmit_broadcast(struct sk_buff *skb, struct net_device *bond_dev, bool all_slaves) { struct bonding *bond = netdev_priv(bond_dev); struct bond_up_slave *slaves; bool xmit_suc = false; bool skb_used = false; int slaves_count, i; if (all_slaves) slaves = rcu_dereference(bond->all_slaves); else slaves = rcu_dereference(bond->usable_slaves); slaves_count = slaves ? READ_ONCE(slaves->count) : 0; for (i = 0; i < slaves_count; i++) { struct slave *slave = slaves->arr[i]; struct sk_buff *skb2; if (!(bond_slave_is_up(slave) && slave->link == BOND_LINK_UP)) continue; if (bond_is_last_slave(bond, slave)) { skb2 = skb; skb_used = true; } else { skb2 = skb_clone(skb, GFP_ATOMIC); if (!skb2) { net_err_ratelimited("%s: Error: %s: skb_clone() failed\n", bond_dev->name, __func__); continue; } } if (bond_dev_queue_xmit(bond, skb2, slave->dev) == NETDEV_TX_OK) xmit_suc = true; } if (!skb_used) dev_kfree_skb_any(skb); if (xmit_suc) return NETDEV_TX_OK; dev_core_stats_tx_dropped_inc(bond_dev); return NET_XMIT_DROP; } /*------------------------- Device initialization ---------------------------*/ /* Lookup the slave that corresponds to a qid */ static inline int bond_slave_override(struct bonding *bond, struct sk_buff *skb) { struct slave *slave = NULL; struct list_head *iter; if (!skb_rx_queue_recorded(skb)) return 1; /* Find out if any slaves have the same mapping as this skb. */ bond_for_each_slave_rcu(bond, slave, iter) { if (READ_ONCE(slave->queue_id) == skb_get_queue_mapping(skb)) { if (bond_slave_is_up(slave) && slave->link == BOND_LINK_UP) { bond_dev_queue_xmit(bond, skb, slave->dev); return 0; } /* If the slave isn't UP, use default transmit policy. */ break; } } return 1; } static u16 bond_select_queue(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev) { /* This helper function exists to help dev_pick_tx get the correct * destination queue. 
Using a helper function skips a call to * skb_tx_hash and will put the skbs in the queue we expect on their * way down to the bonding driver. */ u16 txq = skb_rx_queue_recorded(skb) ? skb_get_rx_queue(skb) : 0; /* Save the original txq to restore before passing to the driver */ qdisc_skb_cb(skb)->slave_dev_queue_mapping = skb_get_queue_mapping(skb); if (unlikely(txq >= dev->real_num_tx_queues)) { do { txq -= dev->real_num_tx_queues; } while (txq >= dev->real_num_tx_queues); } return txq; } static struct net_device *bond_xmit_get_slave(struct net_device *master_dev, struct sk_buff *skb, bool all_slaves) { struct bonding *bond = netdev_priv(master_dev); struct bond_up_slave *slaves; struct slave *slave = NULL; switch (BOND_MODE(bond)) { case BOND_MODE_ROUNDROBIN: slave = bond_xmit_roundrobin_slave_get(bond, skb); break; case BOND_MODE_ACTIVEBACKUP: slave = bond_xmit_activebackup_slave_get(bond); break; case BOND_MODE_8023AD: case BOND_MODE_XOR: if (all_slaves) slaves = rcu_dereference(bond->all_slaves); else slaves = rcu_dereference(bond->usable_slaves); slave = bond_xmit_3ad_xor_slave_get(bond, skb, slaves); break; case BOND_MODE_BROADCAST: break; case BOND_MODE_ALB: slave = bond_xmit_alb_slave_get(bond, skb); break; case BOND_MODE_TLB: slave = bond_xmit_tlb_slave_get(bond, skb); break; default: /* Should never happen, mode already checked */ WARN_ONCE(true, "Unknown bonding mode"); break; } if (slave) return slave->dev; return NULL; } static void bond_sk_to_flow(struct sock *sk, struct flow_keys *flow) { switch (sk->sk_family) { #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: if (ipv6_only_sock(sk) || ipv6_addr_type(&sk->sk_v6_daddr) != IPV6_ADDR_MAPPED) { flow->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; flow->addrs.v6addrs.src = inet6_sk(sk)->saddr; flow->addrs.v6addrs.dst = sk->sk_v6_daddr; break; } fallthrough; #endif default: /* AF_INET */ flow->control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; flow->addrs.v4addrs.src = inet_sk(sk)->inet_rcv_saddr; flow->addrs.v4addrs.dst = inet_sk(sk)->inet_daddr; break; } flow->ports.src = inet_sk(sk)->inet_sport; flow->ports.dst = inet_sk(sk)->inet_dport; } /** * bond_sk_hash_l34 - generate a hash value based on the socket's L3 and L4 fields * @sk: socket to use for headers * * This function will extract the necessary field from the socket and use * them to generate a hash based on the LAYER34 xmit_policy. * Assumes that sk is a TCP or UDP socket. */ static u32 bond_sk_hash_l34(struct sock *sk) { struct flow_keys flow; u32 hash; bond_sk_to_flow(sk, &flow); /* L4 */ memcpy(&hash, &flow.ports.ports, sizeof(hash)); /* L3 */ return bond_ip_hash(hash, &flow, BOND_XMIT_POLICY_LAYER34); } static struct net_device *__bond_sk_get_lower_dev(struct bonding *bond, struct sock *sk) { struct bond_up_slave *slaves; struct slave *slave; unsigned int count; u32 hash; slaves = rcu_dereference(bond->usable_slaves); count = slaves ? 
READ_ONCE(slaves->count) : 0; if (unlikely(!count)) return NULL; hash = bond_sk_hash_l34(sk); slave = slaves->arr[hash % count]; return slave->dev; } static struct net_device *bond_sk_get_lower_dev(struct net_device *dev, struct sock *sk) { struct bonding *bond = netdev_priv(dev); struct net_device *lower = NULL; rcu_read_lock(); if (bond_sk_check(bond)) lower = __bond_sk_get_lower_dev(bond, sk); rcu_read_unlock(); return lower; } #if IS_ENABLED(CONFIG_TLS_DEVICE) static netdev_tx_t bond_tls_device_xmit(struct bonding *bond, struct sk_buff *skb, struct net_device *dev) { struct net_device *tls_netdev = rcu_dereference(tls_get_ctx(skb->sk)->netdev); /* tls_netdev might become NULL, even if tls_is_skb_tx_device_offloaded * was true, if tls_device_down is running in parallel, but it's OK, * because bond_get_slave_by_dev has a NULL check. */ if (likely(bond_get_slave_by_dev(bond, tls_netdev))) return bond_dev_queue_xmit(bond, skb, tls_netdev); return bond_tx_drop(dev, skb); } #endif static netdev_tx_t __bond_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct bonding *bond = netdev_priv(dev); if (bond_should_override_tx_queue(bond) && !bond_slave_override(bond, skb)) return NETDEV_TX_OK; #if IS_ENABLED(CONFIG_TLS_DEVICE) if (tls_is_skb_tx_device_offloaded(skb)) return bond_tls_device_xmit(bond, skb, dev); #endif switch (BOND_MODE(bond)) { case BOND_MODE_ROUNDROBIN: return bond_xmit_roundrobin(skb, dev); case BOND_MODE_ACTIVEBACKUP: return bond_xmit_activebackup(skb, dev); case BOND_MODE_8023AD: if (bond_should_broadcast_neighbor(skb, dev)) return bond_xmit_broadcast(skb, dev, false); fallthrough; case BOND_MODE_XOR: return bond_3ad_xor_xmit(skb, dev); case BOND_MODE_BROADCAST: return bond_xmit_broadcast(skb, dev, true); case BOND_MODE_ALB: return bond_alb_xmit(skb, dev); case BOND_MODE_TLB: return bond_tlb_xmit(skb, dev); default: /* Should never happen, mode already checked */ netdev_err(dev, "Unknown bonding mode %d\n", BOND_MODE(bond)); WARN_ON_ONCE(1); return bond_tx_drop(dev, skb); } } static netdev_tx_t bond_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct bonding *bond = netdev_priv(dev); netdev_tx_t ret = NETDEV_TX_OK; /* If we risk deadlock from transmitting this in the * netpoll path, tell netpoll to queue the frame for later tx */ if (unlikely(is_netpoll_tx_blocked(dev))) return NETDEV_TX_BUSY; rcu_read_lock(); if (bond_has_slaves(bond)) ret = __bond_start_xmit(skb, dev); else ret = bond_tx_drop(dev, skb); rcu_read_unlock(); return ret; } static struct net_device * bond_xdp_get_xmit_slave(struct net_device *bond_dev, struct xdp_buff *xdp) { struct bonding *bond = netdev_priv(bond_dev); struct slave *slave; /* Caller needs to hold rcu_read_lock() */ switch (BOND_MODE(bond)) { case BOND_MODE_ROUNDROBIN: slave = bond_xdp_xmit_roundrobin_slave_get(bond, xdp); break; case BOND_MODE_ACTIVEBACKUP: slave = bond_xmit_activebackup_slave_get(bond); break; case BOND_MODE_8023AD: case BOND_MODE_XOR: slave = bond_xdp_xmit_3ad_xor_slave_get(bond, xdp); break; default: if (net_ratelimit()) netdev_err(bond_dev, "Unknown bonding mode %d for xdp xmit\n", BOND_MODE(bond)); return NULL; } if (slave) return slave->dev; return NULL; } static int bond_xdp_xmit(struct net_device *bond_dev, int n, struct xdp_frame **frames, u32 flags) { int nxmit, err = -ENXIO; rcu_read_lock(); for (nxmit = 0; nxmit < n; nxmit++) { struct xdp_frame *frame = frames[nxmit]; struct xdp_frame *frames1[] = {frame}; struct net_device *slave_dev; struct xdp_buff xdp; xdp_convert_frame_to_buff(frame, 
&xdp); slave_dev = bond_xdp_get_xmit_slave(bond_dev, &xdp); if (!slave_dev) { err = -ENXIO; break; } err = slave_dev->netdev_ops->ndo_xdp_xmit(slave_dev, 1, frames1, flags); if (err < 1) break; } rcu_read_unlock(); /* If error happened on the first frame then we can pass the error up, otherwise * report the number of frames that were xmitted. */ if (err < 0) return (nxmit == 0 ? err : nxmit); return nxmit; } static int bond_xdp_set(struct net_device *dev, struct bpf_prog *prog, struct netlink_ext_ack *extack) { struct bonding *bond = netdev_priv(dev); struct list_head *iter; struct slave *slave, *rollback_slave; struct bpf_prog *old_prog; struct netdev_bpf xdp = { .command = XDP_SETUP_PROG, .flags = 0, .prog = prog, .extack = extack, }; int err; ASSERT_RTNL(); if (!bond_xdp_check(bond, BOND_MODE(bond))) { BOND_NL_ERR(dev, extack, "No native XDP support for the current bonding mode"); return -EOPNOTSUPP; } old_prog = bond->xdp_prog; bond->xdp_prog = prog; bond_for_each_slave(bond, slave, iter) { struct net_device *slave_dev = slave->dev; if (!slave_dev->netdev_ops->ndo_bpf || !slave_dev->netdev_ops->ndo_xdp_xmit) { SLAVE_NL_ERR(dev, slave_dev, extack, "Slave device does not support XDP"); err = -EOPNOTSUPP; goto err; } if (dev_xdp_prog_count(slave_dev) > 0) { SLAVE_NL_ERR(dev, slave_dev, extack, "Slave has XDP program loaded, please unload before enslaving"); err = -EOPNOTSUPP; goto err; } err = dev_xdp_propagate(slave_dev, &xdp); if (err < 0) { /* ndo_bpf() sets extack error message */ slave_err(dev, slave_dev, "Error %d calling ndo_bpf\n", err); goto err; } if (prog) bpf_prog_inc(prog); } if (prog) { static_branch_inc(&bpf_master_redirect_enabled_key); } else if (old_prog) { bpf_prog_put(old_prog); static_branch_dec(&bpf_master_redirect_enabled_key); } return 0; err: /* unwind the program changes */ bond->xdp_prog = old_prog; xdp.prog = old_prog; xdp.extack = NULL; /* do not overwrite original error */ bond_for_each_slave(bond, rollback_slave, iter) { struct net_device *slave_dev = rollback_slave->dev; int err_unwind; if (slave == rollback_slave) break; err_unwind = dev_xdp_propagate(slave_dev, &xdp); if (err_unwind < 0) slave_err(dev, slave_dev, "Error %d when unwinding XDP program change\n", err_unwind); else if (xdp.prog) bpf_prog_inc(xdp.prog); } return err; } static int bond_xdp(struct net_device *dev, struct netdev_bpf *xdp) { switch (xdp->command) { case XDP_SETUP_PROG: return bond_xdp_set(dev, xdp->prog, xdp->extack); default: return -EINVAL; } } static u32 bond_mode_bcast_speed(struct slave *slave, u32 speed) { if (speed == 0 || speed == SPEED_UNKNOWN) speed = slave->speed; else speed = min(speed, slave->speed); return speed; } /* Set the BOND_PHC_INDEX flag to notify user space */ static int bond_set_phc_index_flag(struct kernel_hwtstamp_config *kernel_cfg) { struct ifreq *ifr = kernel_cfg->ifr; struct hwtstamp_config cfg; if (kernel_cfg->copied_to_user) { /* Lower device has a legacy implementation */ if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg))) return -EFAULT; cfg.flags |= HWTSTAMP_FLAG_BONDED_PHC_INDEX; if (copy_to_user(ifr->ifr_data, &cfg, sizeof(cfg))) return -EFAULT; } else { kernel_cfg->flags |= HWTSTAMP_FLAG_BONDED_PHC_INDEX; } return 0; } static int bond_hwtstamp_get(struct net_device *dev, struct kernel_hwtstamp_config *cfg) { struct bonding *bond = netdev_priv(dev); struct net_device *real_dev; int err; real_dev = bond_option_active_slave_get_rcu(bond); if (!real_dev) return -EOPNOTSUPP; err = generic_hwtstamp_get_lower(real_dev, cfg); if (err) return err; 
return bond_set_phc_index_flag(cfg); } static int bond_hwtstamp_set(struct net_device *dev, struct kernel_hwtstamp_config *cfg, struct netlink_ext_ack *extack) { struct bonding *bond = netdev_priv(dev); struct net_device *real_dev; int err; if (!(cfg->flags & HWTSTAMP_FLAG_BONDED_PHC_INDEX)) return -EOPNOTSUPP; real_dev = bond_option_active_slave_get_rcu(bond); if (!real_dev) return -EOPNOTSUPP; err = generic_hwtstamp_set_lower(real_dev, cfg, extack); if (err) return err; return bond_set_phc_index_flag(cfg); } static int bond_ethtool_get_link_ksettings(struct net_device *bond_dev, struct ethtool_link_ksettings *cmd) { struct bonding *bond = netdev_priv(bond_dev); struct list_head *iter; struct slave *slave; u32 speed = 0; cmd->base.duplex = DUPLEX_UNKNOWN; cmd->base.port = PORT_OTHER; /* Since bond_slave_can_tx returns false for all inactive or down slaves, we * do not need to check mode. Though link speed might not represent * the true receive or transmit bandwidth (not all modes are symmetric) * this is an accurate maximum. */ bond_for_each_slave(bond, slave, iter) { if (bond_slave_can_tx(slave)) { bond_update_speed_duplex(slave); if (slave->speed != SPEED_UNKNOWN) { if (BOND_MODE(bond) == BOND_MODE_BROADCAST) speed = bond_mode_bcast_speed(slave, speed); else speed += slave->speed; } if (cmd->base.duplex == DUPLEX_UNKNOWN && slave->duplex != DUPLEX_UNKNOWN) cmd->base.duplex = slave->duplex; } } cmd->base.speed = speed ? : SPEED_UNKNOWN; return 0; } static void bond_ethtool_get_drvinfo(struct net_device *bond_dev, struct ethtool_drvinfo *drvinfo) { strscpy(drvinfo->driver, DRV_NAME, sizeof(drvinfo->driver)); snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version), "%d", BOND_ABI_VERSION); } static int bond_ethtool_get_ts_info(struct net_device *bond_dev, struct kernel_ethtool_ts_info *info) { struct bonding *bond = netdev_priv(bond_dev); struct kernel_ethtool_ts_info ts_info; struct net_device *real_dev; bool sw_tx_support = false; struct list_head *iter; struct slave *slave; int ret = 0; rcu_read_lock(); real_dev = bond_option_active_slave_get_rcu(bond); dev_hold(real_dev); rcu_read_unlock(); if (real_dev) { ret = ethtool_get_ts_info_by_layer(real_dev, info); } else { /* Check if all slaves support software tx timestamping */ rcu_read_lock(); bond_for_each_slave_rcu(bond, slave, iter) { ret = ethtool_get_ts_info_by_layer(slave->dev, &ts_info); if (!ret && (ts_info.so_timestamping & SOF_TIMESTAMPING_TX_SOFTWARE)) { sw_tx_support = true; continue; } sw_tx_support = false; break; } rcu_read_unlock(); } if (sw_tx_support) info->so_timestamping |= SOF_TIMESTAMPING_TX_SOFTWARE; dev_put(real_dev); return ret; } static const struct ethtool_ops bond_ethtool_ops = { .get_drvinfo = bond_ethtool_get_drvinfo, .get_link = ethtool_op_get_link, .get_link_ksettings = bond_ethtool_get_link_ksettings, .get_ts_info = bond_ethtool_get_ts_info, }; static const struct net_device_ops bond_netdev_ops = { .ndo_init = bond_init, .ndo_uninit = bond_uninit, .ndo_open = bond_open, .ndo_stop = bond_close, .ndo_start_xmit = bond_start_xmit, .ndo_select_queue = bond_select_queue, .ndo_get_stats64 = bond_get_stats, .ndo_eth_ioctl = bond_eth_ioctl, .ndo_siocbond = bond_do_ioctl, .ndo_siocdevprivate = bond_siocdevprivate, .ndo_change_rx_flags = bond_change_rx_flags, .ndo_set_rx_mode = bond_set_rx_mode, .ndo_change_mtu = bond_change_mtu, .ndo_set_mac_address = bond_set_mac_address, .ndo_neigh_setup = bond_neigh_setup, .ndo_vlan_rx_add_vid = bond_vlan_rx_add_vid, .ndo_vlan_rx_kill_vid = bond_vlan_rx_kill_vid, #ifdef 
CONFIG_NET_POLL_CONTROLLER .ndo_netpoll_setup = bond_netpoll_setup, .ndo_netpoll_cleanup = bond_netpoll_cleanup, .ndo_poll_controller = bond_poll_controller, #endif .ndo_add_slave = bond_enslave, .ndo_del_slave = bond_release, .ndo_fix_features = bond_fix_features, .ndo_features_check = passthru_features_check, .ndo_get_xmit_slave = bond_xmit_get_slave, .ndo_sk_get_lower_dev = bond_sk_get_lower_dev, .ndo_bpf = bond_xdp, .ndo_xdp_xmit = bond_xdp_xmit, .ndo_xdp_get_xmit_slave = bond_xdp_get_xmit_slave, .ndo_hwtstamp_get = bond_hwtstamp_get, .ndo_hwtstamp_set = bond_hwtstamp_set, }; static const struct device_type bond_type = { .name = "bond", }; static void bond_destructor(struct net_device *bond_dev) { struct bonding *bond = netdev_priv(bond_dev); if (bond->wq) destroy_workqueue(bond->wq); free_percpu(bond->rr_tx_counter); } void bond_setup(struct net_device *bond_dev) { struct bonding *bond = netdev_priv(bond_dev); spin_lock_init(&bond->mode_lock); bond->params = bonding_defaults; /* Initialize pointers */ bond->dev = bond_dev; /* Initialize the device entry points */ ether_setup(bond_dev); bond_dev->max_mtu = ETH_MAX_MTU; bond_dev->netdev_ops = &bond_netdev_ops; bond_dev->ethtool_ops = &bond_ethtool_ops; bond_dev->needs_free_netdev = true; bond_dev->priv_destructor = bond_destructor; SET_NETDEV_DEVTYPE(bond_dev, &bond_type); /* Initialize the device options */ bond_dev->flags |= IFF_MASTER; bond_dev->priv_flags |= IFF_BONDING | IFF_UNICAST_FLT | IFF_NO_QUEUE; bond_dev->priv_flags &= ~(IFF_XMIT_DST_RELEASE | IFF_TX_SKB_SHARING); #ifdef CONFIG_XFRM_OFFLOAD /* set up xfrm device ops (only supported in active-backup right now) */ bond_dev->xfrmdev_ops = &bond_xfrmdev_ops; INIT_LIST_HEAD(&bond->ipsec_list); mutex_init(&bond->ipsec_lock); #endif /* CONFIG_XFRM_OFFLOAD */ /* don't acquire bond device's netif_tx_lock when transmitting */ bond_dev->lltx = true; /* Don't allow bond devices to change network namespaces. */ bond_dev->netns_immutable = true; /* By default, we declare the bond to be fully * VLAN hardware accelerated capable. Special * care is taken in the various xmit functions * when there are slaves that are not hw accel * capable */ bond_dev->hw_features = BOND_VLAN_FEATURES | NETIF_F_HW_VLAN_CTAG_RX | NETIF_F_HW_VLAN_CTAG_FILTER | NETIF_F_HW_VLAN_STAG_RX | NETIF_F_HW_VLAN_STAG_FILTER; bond_dev->hw_features |= NETIF_F_GSO_ENCAP_ALL; bond_dev->features |= bond_dev->hw_features; bond_dev->features |= NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX; bond_dev->features |= NETIF_F_GSO_PARTIAL; #ifdef CONFIG_XFRM_OFFLOAD bond_dev->hw_features |= BOND_XFRM_FEATURES; /* Only enable XFRM features if this is an active-backup config */ if (BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP) bond_dev->features |= BOND_XFRM_FEATURES; #endif /* CONFIG_XFRM_OFFLOAD */ } /* Destroy a bonding device. * Must be under rtnl_lock when this function is called. 
*/ static void bond_uninit(struct net_device *bond_dev) { struct bonding *bond = netdev_priv(bond_dev); struct list_head *iter; struct slave *slave; bond_netpoll_cleanup(bond_dev); /* Release the bonded slaves */ bond_for_each_slave(bond, slave, iter) __bond_release_one(bond_dev, slave->dev, true, true); netdev_info(bond_dev, "Released all slaves\n"); #ifdef CONFIG_XFRM_OFFLOAD mutex_destroy(&bond->ipsec_lock); #endif /* CONFIG_XFRM_OFFLOAD */ bond_set_slave_arr(bond, NULL, NULL); list_del_rcu(&bond->bond_list); bond_debug_unregister(bond); } /*------------------------- Module initialization ---------------------------*/ static int __init bond_check_params(struct bond_params *params) { int arp_validate_value, fail_over_mac_value, primary_reselect_value, i; struct bond_opt_value newval; const struct bond_opt_value *valptr; int arp_all_targets_value = 0; u16 ad_actor_sys_prio = 0; u16 ad_user_port_key = 0; __be32 arp_target[BOND_MAX_ARP_TARGETS] = { 0 }; int arp_ip_count; int bond_mode = BOND_MODE_ROUNDROBIN; int xmit_hashtype = BOND_XMIT_POLICY_LAYER2; int lacp_fast = 0; int tlb_dynamic_lb; /* Convert string parameters. */ if (mode) { bond_opt_initstr(&newval, mode); valptr = bond_opt_parse(bond_opt_get(BOND_OPT_MODE), &newval); if (!valptr) { pr_err("Error: Invalid bonding mode \"%s\"\n", mode); return -EINVAL; } bond_mode = valptr->value; } if (xmit_hash_policy) { if (bond_mode == BOND_MODE_ROUNDROBIN || bond_mode == BOND_MODE_ACTIVEBACKUP || bond_mode == BOND_MODE_BROADCAST) { pr_info("xmit_hash_policy param is irrelevant in mode %s\n", bond_mode_name(bond_mode)); } else { bond_opt_initstr(&newval, xmit_hash_policy); valptr = bond_opt_parse(bond_opt_get(BOND_OPT_XMIT_HASH), &newval); if (!valptr) { pr_err("Error: Invalid xmit_hash_policy \"%s\"\n", xmit_hash_policy); return -EINVAL; } xmit_hashtype = valptr->value; } } if (lacp_rate) { if (bond_mode != BOND_MODE_8023AD) { pr_info("lacp_rate param is irrelevant in mode %s\n", bond_mode_name(bond_mode)); } else { bond_opt_initstr(&newval, lacp_rate); valptr = bond_opt_parse(bond_opt_get(BOND_OPT_LACP_RATE), &newval); if (!valptr) { pr_err("Error: Invalid lacp rate \"%s\"\n", lacp_rate); return -EINVAL; } lacp_fast = valptr->value; } } if (ad_select) { bond_opt_initstr(&newval, ad_select); valptr = bond_opt_parse(bond_opt_get(BOND_OPT_AD_SELECT), &newval); if (!valptr) { pr_err("Error: Invalid ad_select \"%s\"\n", ad_select); return -EINVAL; } params->ad_select = valptr->value; if (bond_mode != BOND_MODE_8023AD) pr_warn("ad_select param only affects 802.3ad mode\n"); } else { params->ad_select = BOND_AD_STABLE; } if (max_bonds < 0) { pr_warn("Warning: max_bonds (%d) not in range %d-%d, so it was reset to BOND_DEFAULT_MAX_BONDS (%d)\n", max_bonds, 0, INT_MAX, BOND_DEFAULT_MAX_BONDS); max_bonds = BOND_DEFAULT_MAX_BONDS; } if (miimon < 0) { pr_warn("Warning: miimon module parameter (%d), not in range 0-%d, so it was reset to 0\n", miimon, INT_MAX); miimon = 0; } if (updelay < 0) { pr_warn("Warning: updelay module parameter (%d), not in range 0-%d, so it was reset to 0\n", updelay, INT_MAX); updelay = 0; } if (downdelay < 0) { pr_warn("Warning: downdelay module parameter (%d), not in range 0-%d, so it was reset to 0\n", downdelay, INT_MAX); downdelay = 0; } if (use_carrier != 1) { pr_err("Error: invalid use_carrier parameter (%d)\n", use_carrier); return -EINVAL; } if (num_peer_notif < 0 || num_peer_notif > 255) { pr_warn("Warning: num_grat_arp/num_unsol_na (%d) not in range 0-255 so it was reset to 1\n", num_peer_notif); num_peer_notif = 1; } 
/* reset values for 802.3ad/TLB/ALB */ if (!bond_mode_uses_arp(bond_mode)) { if (!miimon) { pr_warn("Warning: miimon must be specified, otherwise bonding will not detect link failure, speed and duplex which are essential for 802.3ad operation\n"); pr_warn("Forcing miimon to 100msec\n"); miimon = BOND_DEFAULT_MIIMON; } } if (tx_queues < 1 || tx_queues > 255) { pr_warn("Warning: tx_queues (%d) should be between 1 and 255, resetting to %d\n", tx_queues, BOND_DEFAULT_TX_QUEUES); tx_queues = BOND_DEFAULT_TX_QUEUES; } if ((all_slaves_active != 0) && (all_slaves_active != 1)) { pr_warn("Warning: all_slaves_active module parameter (%d), not of valid value (0/1), so it was set to 0\n", all_slaves_active); all_slaves_active = 0; } if (resend_igmp < 0 || resend_igmp > 255) { pr_warn("Warning: resend_igmp (%d) should be between 0 and 255, resetting to %d\n", resend_igmp, BOND_DEFAULT_RESEND_IGMP); resend_igmp = BOND_DEFAULT_RESEND_IGMP; } bond_opt_initval(&newval, packets_per_slave); if (!bond_opt_parse(bond_opt_get(BOND_OPT_PACKETS_PER_SLAVE), &newval)) { pr_warn("Warning: packets_per_slave (%d) should be between 0 and %u resetting to 1\n", packets_per_slave, USHRT_MAX); packets_per_slave = 1; } if (bond_mode == BOND_MODE_ALB) { pr_notice("In ALB mode you might experience client disconnections upon reconnection of a link if the bonding module updelay parameter (%d msec) is incompatible with the forwarding delay time of the switch\n", updelay); } if (!miimon) { if (updelay || downdelay) { /* just warn the user the up/down delay will have * no effect since miimon is zero... */ pr_warn("Warning: miimon module parameter not set and updelay (%d) or downdelay (%d) module parameter is set; updelay and downdelay have no effect unless miimon is set\n", updelay, downdelay); } } else { /* don't allow arp monitoring */ if (arp_interval) { pr_warn("Warning: miimon (%d) and arp_interval (%d) can't be used simultaneously, disabling ARP monitoring\n", miimon, arp_interval); arp_interval = 0; } if ((updelay % miimon) != 0) { pr_warn("Warning: updelay (%d) is not a multiple of miimon (%d), updelay rounded to %d ms\n", updelay, miimon, (updelay / miimon) * miimon); } updelay /= miimon; if ((downdelay % miimon) != 0) { pr_warn("Warning: downdelay (%d) is not a multiple of miimon (%d), downdelay rounded to %d ms\n", downdelay, miimon, (downdelay / miimon) * miimon); } downdelay /= miimon; } if (arp_interval < 0) { pr_warn("Warning: arp_interval module parameter (%d), not in range 0-%d, so it was reset to 0\n", arp_interval, INT_MAX); arp_interval = 0; } for (arp_ip_count = 0, i = 0; (arp_ip_count < BOND_MAX_ARP_TARGETS) && arp_ip_target[i]; i++) { __be32 ip; /* not a complete check, but good enough to catch mistakes */ if (!in4_pton(arp_ip_target[i], -1, (u8 *)&ip, -1, NULL) || !bond_is_ip_target_ok(ip)) { pr_warn("Warning: bad arp_ip_target module parameter (%s), ARP monitoring will not be performed\n", arp_ip_target[i]); arp_interval = 0; } else { if (bond_get_targets_ip(arp_target, ip) == -1) arp_target[arp_ip_count++] = ip; else pr_warn("Warning: duplicate address %pI4 in arp_ip_target, skipping\n", &ip); } } if (arp_interval && !arp_ip_count) { /* don't allow arping if no arp_ip_target given... 
*/ pr_warn("Warning: arp_interval module parameter (%d) specified without providing an arp_ip_target parameter, arp_interval was reset to 0\n", arp_interval); arp_interval = 0; } if (arp_validate) { if (!arp_interval) { pr_err("arp_validate requires arp_interval\n"); return -EINVAL; } bond_opt_initstr(&newval, arp_validate); valptr = bond_opt_parse(bond_opt_get(BOND_OPT_ARP_VALIDATE), &newval); if (!valptr) { pr_err("Error: invalid arp_validate \"%s\"\n", arp_validate); return -EINVAL; } arp_validate_value = valptr->value; } else { arp_validate_value = 0; } if (arp_all_targets) { bond_opt_initstr(&newval, arp_all_targets); valptr = bond_opt_parse(bond_opt_get(BOND_OPT_ARP_ALL_TARGETS), &newval); if (!valptr) { pr_err("Error: invalid arp_all_targets_value \"%s\"\n", arp_all_targets); arp_all_targets_value = 0; } else { arp_all_targets_value = valptr->value; } } if (miimon) { pr_info("MII link monitoring set to %d ms\n", miimon); } else if (arp_interval) { valptr = bond_opt_get_val(BOND_OPT_ARP_VALIDATE, arp_validate_value); pr_info("ARP monitoring set to %d ms, validate %s, with %d target(s):", arp_interval, valptr->string, arp_ip_count); for (i = 0; i < arp_ip_count; i++) pr_cont(" %s", arp_ip_target[i]); pr_cont("\n"); } else if (max_bonds) { /* miimon and arp_interval not set, we need one so things * work as expected, see bonding.txt for details */ pr_debug("Warning: either miimon or arp_interval and arp_ip_target module parameters must be specified, otherwise bonding will not detect link failures! see bonding.txt for details\n"); } if (primary && !bond_mode_uses_primary(bond_mode)) { /* currently, using a primary only makes sense * in active backup, TLB or ALB modes */ pr_warn("Warning: %s primary device specified but has no effect in %s mode\n", primary, bond_mode_name(bond_mode)); primary = NULL; } if (primary && primary_reselect) { bond_opt_initstr(&newval, primary_reselect); valptr = bond_opt_parse(bond_opt_get(BOND_OPT_PRIMARY_RESELECT), &newval); if (!valptr) { pr_err("Error: Invalid primary_reselect \"%s\"\n", primary_reselect); return -EINVAL; } primary_reselect_value = valptr->value; } else { primary_reselect_value = BOND_PRI_RESELECT_ALWAYS; } if (fail_over_mac) { bond_opt_initstr(&newval, fail_over_mac); valptr = bond_opt_parse(bond_opt_get(BOND_OPT_FAIL_OVER_MAC), &newval); if (!valptr) { pr_err("Error: invalid fail_over_mac \"%s\"\n", fail_over_mac); return -EINVAL; } fail_over_mac_value = valptr->value; if (bond_mode != BOND_MODE_ACTIVEBACKUP) pr_warn("Warning: fail_over_mac only affects active-backup mode\n"); } else { fail_over_mac_value = BOND_FOM_NONE; } bond_opt_initstr(&newval, "default"); valptr = bond_opt_parse( bond_opt_get(BOND_OPT_AD_ACTOR_SYS_PRIO), &newval); if (!valptr) { pr_err("Error: No ad_actor_sys_prio default value"); return -EINVAL; } ad_actor_sys_prio = valptr->value; valptr = bond_opt_parse(bond_opt_get(BOND_OPT_AD_USER_PORT_KEY), &newval); if (!valptr) { pr_err("Error: No ad_user_port_key default value"); return -EINVAL; } ad_user_port_key = valptr->value; bond_opt_initstr(&newval, "default"); valptr = bond_opt_parse(bond_opt_get(BOND_OPT_TLB_DYNAMIC_LB), &newval); if (!valptr) { pr_err("Error: No tlb_dynamic_lb default value"); return -EINVAL; } tlb_dynamic_lb = valptr->value; if (lp_interval == 0) { pr_warn("Warning: ip_interval must be between 1 and %d, so it was reset to %d\n", INT_MAX, BOND_ALB_DEFAULT_LP_INTERVAL); lp_interval = BOND_ALB_DEFAULT_LP_INTERVAL; } /* fill params struct with the proper values */ params->mode = bond_mode; 
params->xmit_policy = xmit_hashtype; params->miimon = miimon; params->num_peer_notif = num_peer_notif; params->arp_interval = arp_interval; params->arp_validate = arp_validate_value; params->arp_all_targets = arp_all_targets_value; params->missed_max = 2; params->updelay = updelay; params->downdelay = downdelay; params->peer_notif_delay = 0; params->lacp_active = 1; params->lacp_fast = lacp_fast; params->primary[0] = 0; params->primary_reselect = primary_reselect_value; params->fail_over_mac = fail_over_mac_value; params->tx_queues = tx_queues; params->all_slaves_active = all_slaves_active; params->resend_igmp = resend_igmp; params->min_links = min_links; params->lp_interval = lp_interval; params->packets_per_slave = packets_per_slave; params->tlb_dynamic_lb = tlb_dynamic_lb; params->ad_actor_sys_prio = ad_actor_sys_prio; eth_zero_addr(params->ad_actor_system); params->ad_user_port_key = ad_user_port_key; params->coupled_control = 1; params->broadcast_neighbor = 0; if (packets_per_slave > 0) { params->reciprocal_packets_per_slave = reciprocal_value(packets_per_slave); } else { /* reciprocal_packets_per_slave is unused if * packets_per_slave is 0 or 1, just initialize it */ params->reciprocal_packets_per_slave = (struct reciprocal_value) { 0 }; } if (primary) strscpy_pad(params->primary, primary, sizeof(params->primary)); memcpy(params->arp_targets, arp_target, sizeof(arp_target)); #if IS_ENABLED(CONFIG_IPV6) memset(params->ns_targets, 0, sizeof(struct in6_addr) * BOND_MAX_NS_TARGETS); #endif return 0; } /* Called from registration process */ static int bond_init(struct net_device *bond_dev) { struct bonding *bond = netdev_priv(bond_dev); struct bond_net *bn = net_generic(dev_net(bond_dev), bond_net_id); netdev_dbg(bond_dev, "Begin bond_init\n"); bond->wq = alloc_ordered_workqueue("%s", WQ_MEM_RECLAIM, bond_dev->name); if (!bond->wq) return -ENOMEM; bond->notifier_ctx = false; spin_lock_init(&bond->stats_lock); netdev_lockdep_set_classes(bond_dev); list_add_tail_rcu(&bond->bond_list, &bn->dev_list); bond_prepare_sysfs_group(bond); bond_debug_register(bond); /* Ensure valid dev_addr */ if (is_zero_ether_addr(bond_dev->dev_addr) && bond_dev->addr_assign_type == NET_ADDR_PERM) eth_hw_addr_random(bond_dev); return 0; } unsigned int bond_get_num_tx_queues(void) { return tx_queues; } /* Create a new bond based on the specified name and bonding parameters. * If name is NULL, obtain a suitable "bond%d" name for us. * Caller must NOT hold rtnl_lock; we need to release it here before we * set up our sysfs entries. */ int bond_create(struct net *net, const char *name) { struct net_device *bond_dev; struct bonding *bond; int res = -ENOMEM; rtnl_lock(); bond_dev = alloc_netdev_mq(sizeof(struct bonding), name ? 
name : "bond%d", NET_NAME_UNKNOWN, bond_setup, tx_queues); if (!bond_dev) goto out; bond = netdev_priv(bond_dev); dev_net_set(bond_dev, net); bond_dev->rtnl_link_ops = &bond_link_ops; res = register_netdevice(bond_dev); if (res < 0) { free_netdev(bond_dev); goto out; } netif_carrier_off(bond_dev); bond_work_init_all(bond); out: rtnl_unlock(); return res; } static int __net_init bond_net_init(struct net *net) { struct bond_net *bn = net_generic(net, bond_net_id); bn->net = net; INIT_LIST_HEAD(&bn->dev_list); bond_create_proc_dir(bn); bond_create_sysfs(bn); return 0; } /* According to commit 69b0216ac255 ("bonding: fix bonding_masters * race condition in bond unloading") we need to remove sysfs files * before we remove our devices (done later in bond_net_exit_rtnl()) */ static void __net_exit bond_net_pre_exit(struct net *net) { struct bond_net *bn = net_generic(net, bond_net_id); bond_destroy_sysfs(bn); } static void __net_exit bond_net_exit_rtnl(struct net *net, struct list_head *dev_kill_list) { struct bond_net *bn = net_generic(net, bond_net_id); struct bonding *bond, *tmp_bond; /* Kill off any bonds created after unregistering bond rtnl ops */ list_for_each_entry_safe(bond, tmp_bond, &bn->dev_list, bond_list) unregister_netdevice_queue(bond->dev, dev_kill_list); } /* According to commit 23fa5c2caae0 ("bonding: destroy proc directory * only after all bonds are gone") bond_destroy_proc_dir() is called * after bond_net_exit_rtnl() has completed. */ static void __net_exit bond_net_exit_batch(struct list_head *net_list) { struct bond_net *bn; struct net *net; list_for_each_entry(net, net_list, exit_list) { bn = net_generic(net, bond_net_id); bond_destroy_proc_dir(bn); } } static struct pernet_operations bond_net_ops = { .init = bond_net_init, .pre_exit = bond_net_pre_exit, .exit_rtnl = bond_net_exit_rtnl, .exit_batch = bond_net_exit_batch, .id = &bond_net_id, .size = sizeof(struct bond_net), }; static int __init bonding_init(void) { int i; int res; res = bond_check_params(&bonding_defaults); if (res) goto out; bond_create_debugfs(); res = register_pernet_subsys(&bond_net_ops); if (res) goto err_net_ops; res = bond_netlink_init(); if (res) goto err_link; for (i = 0; i < max_bonds; i++) { res = bond_create(&init_net, NULL); if (res) goto err; } skb_flow_dissector_init(&flow_keys_bonding, flow_keys_bonding_keys, ARRAY_SIZE(flow_keys_bonding_keys)); register_netdevice_notifier(&bond_netdev_notifier); out: return res; err: bond_netlink_fini(); err_link: unregister_pernet_subsys(&bond_net_ops); err_net_ops: bond_destroy_debugfs(); goto out; } static void __exit bonding_exit(void) { unregister_netdevice_notifier(&bond_netdev_notifier); bond_netlink_fini(); unregister_pernet_subsys(&bond_net_ops); bond_destroy_debugfs(); #ifdef CONFIG_NET_POLL_CONTROLLER /* Make sure we don't have an imbalance on our netpoll blocking */ WARN_ON(atomic_read(&netpoll_block_tx)); #endif } module_init(bonding_init); module_exit(bonding_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION(DRV_DESCRIPTION); MODULE_AUTHOR("Thomas Davis, tadavis@lbl.gov and many others"); MODULE_IMPORT_NS("NETDEV_INTERNAL");
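/*
 * Illustrative user-space sketch (not part of the kernel sources above): it
 * exercises the legacy SIOCBONDENSLAVE path that bond_do_ioctl() and
 * bond_siocdevprivate() handle. The "bond0" and "eth0" names are placeholders,
 * the call needs CAP_NET_ADMIN, and modern tooling would use netlink
 * ("ip link set eth0 master bond0") instead of this ioctl.
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>
#include <linux/sockios.h>

int main(void)
{
	struct ifreq ifr;
	int fd = socket(AF_INET, SOCK_DGRAM, 0);

	if (fd < 0) {
		perror("socket");
		return 1;
	}

	/* ifr_name selects the bond master, ifr_slave the device to enslave */
	memset(&ifr, 0, sizeof(ifr));
	strncpy(ifr.ifr_name, "bond0", IFNAMSIZ - 1);
	strncpy(ifr.ifr_slave, "eth0", IFNAMSIZ - 1);

	if (ioctl(fd, SIOCBONDENSLAVE, &ifr) < 0)
		perror("SIOCBONDENSLAVE");

	close(fd);
	return 0;
}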
// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/isofs/util.c
 */

#include <linux/time.h>
#include "isofs.h"

/*
 * We have to convert from a MM/DD/YY format to the Unix ctime format.
 * We have to take into account leap years and all of that good stuff.
 * Unfortunately, the kernel does not have the information on hand to
 * take into account daylight savings time, but it shouldn't matter.
 * The time stored should be localtime (with or without DST in effect),
 * and the timezone offset should hold the offset required to get back
 * to GMT. Thus we should always be correct.
 */

struct timespec64 iso_date(u8 *p, int flags)
{
	int year, month, day, hour, minute, second, tz;
	struct timespec64 ts;

	if (flags & ISO_DATE_LONG_FORM) {
		year = (p[0] - '0') * 1000 + (p[1] - '0') * 100 +
			(p[2] - '0') * 10 + (p[3] - '0') - 1900;
		month = ((p[4] - '0') * 10 + (p[5] - '0'));
		day = ((p[6] - '0') * 10 + (p[7] - '0'));
		hour = ((p[8] - '0') * 10 + (p[9] - '0'));
		minute = ((p[10] - '0') * 10 + (p[11] - '0'));
		second = ((p[12] - '0') * 10 + (p[13] - '0'));
		ts.tv_nsec = ((p[14] - '0') * 10 + (p[15] - '0')) * 10000000;
		tz = p[16];
	} else {
		year = p[0];
		month = p[1];
		day = p[2];
		hour = p[3];
		minute = p[4];
		second = p[5];
		ts.tv_nsec = 0;
		/* High sierra has no time zone */
		tz = flags & ISO_DATE_HIGH_SIERRA ? 0 : p[6];
	}

	if (year < 0) {
		ts.tv_sec = 0;
	} else {
		ts.tv_sec = mktime64(year+1900, month, day, hour, minute, second);

		/* sign extend */
		if (tz & 0x80)
			tz |= (-1 << 8);

		/*
		 * The timezone offset is unreliable on some disks,
		 * so we make a sanity check. In no case is it ever
		 * more than 13 hours from GMT, which is 52*15min.
		 * The time is always stored in localtime with the
		 * timezone offset being what gets added to GMT to
		 * get to localtime. Thus we need to subtract the offset
		 * to get to true GMT, which is what we store the time
		 * as internally. On the local system, the user may set
		 * their timezone any way they wish, of course, so GMT
		 * gets converted back to localtime on the receiving
		 * system.
		 *
		 * NOTE: mkisofs in versions prior to mkisofs-1.10 had
		 * the sign wrong on the timezone offset. This has now
		 * been corrected there too, but if you are getting screwy
		 * results this may be the explanation. If enough people
		 * complain, a user configuration option could be added
		 * to add the timezone offset in with the wrong sign
		 * for 'compatibility' with older discs, but I cannot see how
		 * it will matter that much.
		 *
		 * Thanks to kuhlmav@elec.canterbury.ac.nz (Volker Kuhlmann)
		 * for pointing out the sign error.
		 */
		if (-52 <= tz && tz <= 52)
			ts.tv_sec -= tz * 15 * 60;
	}
	return ts;
}
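/*
 * Illustrative user-space sketch (not part of the kernel sources): it mirrors
 * the timezone handling of iso_date() above for the 7-byte short form. The
 * sample descriptor bytes are hypothetical; the point is the sign extension of
 * the tz byte and the clamp to +/-52 quarter-hour units (13 hours) before the
 * offset is subtracted from localtime to reach GMT.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* hypothetical short-form stamp: 2025-06-15 12:30:00, tz = -8 (UTC-2h) */
	uint8_t p[7] = { 125, 6, 15, 12, 30, 0, (uint8_t)-8 };
	int tz = p[6];

	/* sign extend the 8-bit two's complement offset, as iso_date() does */
	if (tz & 0x80)
		tz |= ~0xff;

	if (-52 <= tz && tz <= 52)
		printf("local time adjusted by %d seconds to reach GMT\n",
		       -tz * 15 * 60);
	else
		printf("offset %d is implausible, ignored\n", tz);

	return 0;
}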
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * devres.c - managed gpio resources
 * This file is based on kernel/irq/devres.c
 *
 * Copyright (c) 2011 John Crispin <john@phrozen.org>
 */

#include <linux/device/devres.h>
#include <linux/err.h>
#include <linux/export.h>
#include <linux/gfp.h>
#include <linux/types.h>

#include <linux/gpio/consumer.h>

#include "gpiolib.h"

struct fwnode_handle;
struct lock_class_key;

static void devm_gpiod_release(void *desc)
{
	gpiod_put(desc);
}

static void devm_gpiod_release_array(void *descs)
{
	gpiod_put_array(descs);
}

/**
 * devm_gpiod_get - Resource-managed gpiod_get()
 * @dev: GPIO consumer
 * @con_id: function within the GPIO consumer
 * @flags: optional GPIO initialization flags
 *
 * Managed gpiod_get(). GPIO descriptors returned from this function are
 * automatically disposed on driver detach. See gpiod_get() for detailed
 * information about behavior and return values.
 *
 * Returns:
 * The GPIO descriptor corresponding to the function @con_id of device
 * dev, %-ENOENT if no GPIO has been assigned to the requested function, or
 * another IS_ERR() code if an error occurred while trying to acquire the GPIO.
 */
struct gpio_desc *__must_check devm_gpiod_get(struct device *dev,
		const char *con_id, enum gpiod_flags flags)
{
	return devm_gpiod_get_index(dev, con_id, 0, flags);
}
EXPORT_SYMBOL_GPL(devm_gpiod_get);

/**
 * devm_gpiod_get_optional - Resource-managed gpiod_get_optional()
 * @dev: GPIO consumer
 * @con_id: function within the GPIO consumer
 * @flags: optional GPIO initialization flags
 *
 * Managed gpiod_get_optional(). GPIO descriptors returned from this function
 * are automatically disposed on driver detach. See gpiod_get_optional() for
 * detailed information about behavior and return values.
 *
 * Returns:
 * The GPIO descriptor corresponding to the function @con_id of device
 * dev, NULL if no GPIO has been assigned to the requested function, or
 * another IS_ERR() code if an error occurred while trying to acquire the GPIO.
*/ struct gpio_desc *__must_check devm_gpiod_get_optional(struct device *dev, const char *con_id, enum gpiod_flags flags) { return devm_gpiod_get_index_optional(dev, con_id, 0, flags); } EXPORT_SYMBOL_GPL(devm_gpiod_get_optional); /** * devm_gpiod_get_index - Resource-managed gpiod_get_index() * @dev: GPIO consumer * @con_id: function within the GPIO consumer * @idx: index of the GPIO to obtain in the consumer * @flags: optional GPIO initialization flags * * Managed gpiod_get_index(). GPIO descriptors returned from this function are * automatically disposed on driver detach. See gpiod_get_index() for detailed * information about behavior and return values. * * Returns: * The GPIO descriptor corresponding to the function @con_id of device * dev, %-ENOENT if no GPIO has been assigned to the requested function, or * another IS_ERR() code if an error occurred while trying to acquire the GPIO. */ struct gpio_desc *__must_check devm_gpiod_get_index(struct device *dev, const char *con_id, unsigned int idx, enum gpiod_flags flags) { struct gpio_desc *desc; int ret; desc = gpiod_get_index(dev, con_id, idx, flags); if (IS_ERR(desc)) return desc; /* * For non-exclusive GPIO descriptors, check if this descriptor is * already under resource management by this device. */ if (flags & GPIOD_FLAGS_BIT_NONEXCLUSIVE) { bool dres; dres = devm_is_action_added(dev, devm_gpiod_release, desc); if (dres) return desc; } ret = devm_add_action_or_reset(dev, devm_gpiod_release, desc); if (ret) return ERR_PTR(ret); return desc; } EXPORT_SYMBOL_GPL(devm_gpiod_get_index); /** * devm_fwnode_gpiod_get_index - get a GPIO descriptor from a given node * @dev: GPIO consumer * @fwnode: firmware node containing GPIO reference * @con_id: function within the GPIO consumer * @index: index of the GPIO to obtain in the consumer * @flags: GPIO initialization flags * @label: label to attach to the requested GPIO * * GPIO descriptors returned from this function are automatically disposed on * driver detach. * * Returns: * The GPIO descriptor corresponding to the function @con_id of device * dev, %-ENOENT if no GPIO has been assigned to the requested function, or * another IS_ERR() code if an error occurred while trying to acquire the GPIO. */ struct gpio_desc *devm_fwnode_gpiod_get_index(struct device *dev, struct fwnode_handle *fwnode, const char *con_id, int index, enum gpiod_flags flags, const char *label) { struct gpio_desc *desc; int ret; desc = gpiod_find_and_request(dev, fwnode, con_id, index, flags, label, false); if (IS_ERR(desc)) return desc; ret = devm_add_action_or_reset(dev, devm_gpiod_release, desc); if (ret) return ERR_PTR(ret); return desc; } EXPORT_SYMBOL_GPL(devm_fwnode_gpiod_get_index); /** * devm_gpiod_get_index_optional - Resource-managed gpiod_get_index_optional() * @dev: GPIO consumer * @con_id: function within the GPIO consumer * @index: index of the GPIO to obtain in the consumer * @flags: optional GPIO initialization flags * * Managed gpiod_get_index_optional(). GPIO descriptors returned from this * function are automatically disposed on driver detach. See * gpiod_get_index_optional() for detailed information about behavior and * return values. * * Returns: * The GPIO descriptor corresponding to the function @con_id of device * dev, %NULL if no GPIO has been assigned to the requested function, or * another IS_ERR() code if an error occurred while trying to acquire the GPIO. 
*/ struct gpio_desc *__must_check devm_gpiod_get_index_optional(struct device *dev, const char *con_id, unsigned int index, enum gpiod_flags flags) { struct gpio_desc *desc; desc = devm_gpiod_get_index(dev, con_id, index, flags); if (gpiod_not_found(desc)) return NULL; return desc; } EXPORT_SYMBOL_GPL(devm_gpiod_get_index_optional); /** * devm_gpiod_get_array - Resource-managed gpiod_get_array() * @dev: GPIO consumer * @con_id: function within the GPIO consumer * @flags: optional GPIO initialization flags * * Managed gpiod_get_array(). GPIO descriptors returned from this function are * automatically disposed on driver detach. See gpiod_get_array() for detailed * information about behavior and return values. * * Returns: * The GPIO descriptors corresponding to the function @con_id of device * dev, %-ENOENT if no GPIO has been assigned to the requested function, * or another IS_ERR() code if an error occurred while trying to acquire * the GPIOs. */ struct gpio_descs *__must_check devm_gpiod_get_array(struct device *dev, const char *con_id, enum gpiod_flags flags) { struct gpio_descs *descs; int ret; descs = gpiod_get_array(dev, con_id, flags); if (IS_ERR(descs)) return descs; ret = devm_add_action_or_reset(dev, devm_gpiod_release_array, descs); if (ret) return ERR_PTR(ret); return descs; } EXPORT_SYMBOL_GPL(devm_gpiod_get_array); /** * devm_gpiod_get_array_optional - Resource-managed gpiod_get_array_optional() * @dev: GPIO consumer * @con_id: function within the GPIO consumer * @flags: optional GPIO initialization flags * * Managed gpiod_get_array_optional(). GPIO descriptors returned from this * function are automatically disposed on driver detach. * See gpiod_get_array_optional() for detailed information about behavior and * return values. * * Returns: * The GPIO descriptors corresponding to the function @con_id of device * dev, %NULL if no GPIO has been assigned to the requested function, * or another IS_ERR() code if an error occurred while trying to acquire * the GPIOs. */ struct gpio_descs *__must_check devm_gpiod_get_array_optional(struct device *dev, const char *con_id, enum gpiod_flags flags) { struct gpio_descs *descs; descs = devm_gpiod_get_array(dev, con_id, flags); if (gpiod_not_found(descs)) return NULL; return descs; } EXPORT_SYMBOL_GPL(devm_gpiod_get_array_optional); /** * devm_gpiod_put - Resource-managed gpiod_put() * @dev: GPIO consumer * @desc: GPIO descriptor to dispose of * * Dispose of a GPIO descriptor obtained with devm_gpiod_get() or * devm_gpiod_get_index(). Normally this function will not be called as the GPIO * will be disposed of by the resource management code. */ void devm_gpiod_put(struct device *dev, struct gpio_desc *desc) { devm_release_action(dev, devm_gpiod_release, desc); } EXPORT_SYMBOL_GPL(devm_gpiod_put); /** * devm_gpiod_unhinge - Remove resource management from a gpio descriptor * @dev: GPIO consumer * @desc: GPIO descriptor to remove resource management from * * *DEPRECATED* * This function should not be used. It's been provided as a workaround for * resource ownership issues in the regulator framework and should be replaced * with a better solution. * * Remove resource management from a GPIO descriptor. This is needed when * you want to hand over lifecycle management of a descriptor to another * mechanism. 
 */
void devm_gpiod_unhinge(struct device *dev, struct gpio_desc *desc)
{
	int ret;

	if (IS_ERR_OR_NULL(desc))
		return;

	/*
	 * If the GPIO descriptor is requested as nonexclusive, we
	 * may call this function several times on the same descriptor
	 * so it is OK if devres_destroy() returns -ENOENT.
	 */
	ret = devm_remove_action_nowarn(dev, devm_gpiod_release, desc);
	if (ret == -ENOENT)
		return;
	/* Anything else we should warn about */
	WARN_ON(ret);
}
EXPORT_SYMBOL_GPL(devm_gpiod_unhinge);

/**
 * devm_gpiod_put_array - Resource-managed gpiod_put_array()
 * @dev: GPIO consumer
 * @descs: GPIO descriptor array to dispose of
 *
 * Dispose of an array of GPIO descriptors obtained with devm_gpiod_get_array().
 * Normally this function will not be called as the GPIOs will be disposed of
 * by the resource management code.
 */
void devm_gpiod_put_array(struct device *dev, struct gpio_descs *descs)
{
	devm_release_action(dev, devm_gpiod_release_array, descs);
}
EXPORT_SYMBOL_GPL(devm_gpiod_put_array);

static void devm_gpio_chip_release(void *data)
{
	struct gpio_chip *gc = data;

	gpiochip_remove(gc);
}

/**
 * devm_gpiochip_add_data_with_key() - Resource managed gpiochip_add_data_with_key()
 * @dev: pointer to the device that gpio_chip belongs to.
 * @gc: the GPIO chip to register
 * @data: driver-private data associated with this chip
 * @lock_key: lockdep class for IRQ lock
 * @request_key: lockdep class for IRQ request
 *
 * Context: potentially before irqs will work
 *
 * The gpio chip will automatically be released when the device is unbound.
 *
 * Returns:
 * A negative errno if the chip can't be registered, such as because the
 * gc->base is invalid or already associated with a different chip.
 * Otherwise it returns zero as a success code.
 */
int devm_gpiochip_add_data_with_key(struct device *dev, struct gpio_chip *gc,
				    void *data, struct lock_class_key *lock_key,
				    struct lock_class_key *request_key)
{
	int ret;

	ret = gpiochip_add_data_with_key(gc, data, lock_key, request_key);
	if (ret < 0)
		return ret;

	return devm_add_action_or_reset(dev, devm_gpio_chip_release, gc);
}
EXPORT_SYMBOL_GPL(devm_gpiochip_add_data_with_key);
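/*
 * Illustrative sketch (not part of the file above): how a consumer driver
 * might typically use the managed gpiod API shown here. The "reset" and
 * "led" function names and the my_drv_probe() wrapper are hypothetical;
 * devm_gpiod_get_optional(), devm_gpiod_get_array() and
 * gpiod_set_value_cansleep() are the interfaces documented above. No
 * explicit cleanup is needed: the descriptors are released automatically
 * on driver detach.
 */
#include <linux/device.h>
#include <linux/err.h>
#include <linux/gpio/consumer.h>

static int my_drv_probe(struct device *dev)
{
	struct gpio_desc *reset;
	struct gpio_descs *leds;

	/* Optional GPIO: returns NULL (not an error) when none is mapped. */
	reset = devm_gpiod_get_optional(dev, "reset", GPIOD_OUT_LOW);
	if (IS_ERR(reset))
		return PTR_ERR(reset);

	/* All GPIOs mapped to the "led" function, requested as outputs. */
	leds = devm_gpiod_get_array(dev, "led", GPIOD_OUT_LOW);
	if (IS_ERR(leds))
		return PTR_ERR(leds);

	if (reset)
		gpiod_set_value_cansleep(reset, 1);

	return 0;
}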
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Berkeley style UIO structures - Alan Cox 1994.
 */
#ifndef __LINUX_UIO_H
#define __LINUX_UIO_H

#include <linux/kernel.h>
#include <linux/mm_types.h>
#include <linux/ucopysize.h>
#include <uapi/linux/uio.h>

struct page;
struct folio_queue;

typedef unsigned int __bitwise iov_iter_extraction_t;

struct kvec {
	void *iov_base; /* and that should *never* hold a userland pointer */
	size_t iov_len;
};

enum iter_type {
	/* iter types */
	ITER_UBUF,
	ITER_IOVEC,
	ITER_BVEC,
	ITER_KVEC,
	ITER_FOLIOQ,
	ITER_XARRAY,
	ITER_DISCARD,
};

#define ITER_SOURCE	1	// == WRITE
#define ITER_DEST	0	// == READ

struct iov_iter_state {
	size_t iov_offset;
	size_t count;
	unsigned long nr_segs;
};

struct iov_iter {
	u8 iter_type;
	bool nofault;
	bool data_source;
	size_t iov_offset;
	/*
	 * Hack alert: overlay ubuf_iovec with iovec + count, so
	 * that the members resolve correctly regardless of the type
	 * of iterator used. This means that you can use:
	 *
	 * &iter->__ubuf_iovec or iter->__iov
	 *
	 * interchangeably for the user_backed cases, hence simplifying
	 * some of the cases that need to deal with both.
	 */
	union {
		/*
		 * This really should be a const, but we cannot do that without
		 * also modifying any of the zero-filling iter init functions.
		 * Leave it non-const for now, but it should be treated as such.
*/ struct iovec __ubuf_iovec; struct { union { /* use iter_iov() to get the current vec */ const struct iovec *__iov; const struct kvec *kvec; const struct bio_vec *bvec; const struct folio_queue *folioq; struct xarray *xarray; void __user *ubuf; }; size_t count; }; }; union { unsigned long nr_segs; u8 folioq_slot; loff_t xarray_start; }; }; typedef __u16 uio_meta_flags_t; struct uio_meta { uio_meta_flags_t flags; u16 app_tag; u64 seed; struct iov_iter iter; }; static inline const struct iovec *iter_iov(const struct iov_iter *iter) { if (iter->iter_type == ITER_UBUF) return (const struct iovec *) &iter->__ubuf_iovec; return iter->__iov; } #define iter_iov_addr(iter) (iter_iov(iter)->iov_base + (iter)->iov_offset) static inline size_t iter_iov_len(const struct iov_iter *i) { if (i->iter_type == ITER_UBUF) return i->count; return iter_iov(i)->iov_len - i->iov_offset; } static inline enum iter_type iov_iter_type(const struct iov_iter *i) { return i->iter_type; } static inline void iov_iter_save_state(struct iov_iter *iter, struct iov_iter_state *state) { state->iov_offset = iter->iov_offset; state->count = iter->count; state->nr_segs = iter->nr_segs; } static inline bool iter_is_ubuf(const struct iov_iter *i) { return iov_iter_type(i) == ITER_UBUF; } static inline bool iter_is_iovec(const struct iov_iter *i) { return iov_iter_type(i) == ITER_IOVEC; } static inline bool iov_iter_is_kvec(const struct iov_iter *i) { return iov_iter_type(i) == ITER_KVEC; } static inline bool iov_iter_is_bvec(const struct iov_iter *i) { return iov_iter_type(i) == ITER_BVEC; } static inline bool iov_iter_is_discard(const struct iov_iter *i) { return iov_iter_type(i) == ITER_DISCARD; } static inline bool iov_iter_is_folioq(const struct iov_iter *i) { return iov_iter_type(i) == ITER_FOLIOQ; } static inline bool iov_iter_is_xarray(const struct iov_iter *i) { return iov_iter_type(i) == ITER_XARRAY; } static inline unsigned char iov_iter_rw(const struct iov_iter *i) { return i->data_source ? WRITE : READ; } static inline bool user_backed_iter(const struct iov_iter *i) { return iter_is_ubuf(i) || iter_is_iovec(i); } /* * Total number of bytes covered by an iovec. * * NOTE that it is not safe to use this function until all the iovec's * segment lengths have been validated. Because the individual lengths can * overflow a size_t when added together. 
*/ static inline size_t iov_length(const struct iovec *iov, unsigned long nr_segs) { unsigned long seg; size_t ret = 0; for (seg = 0; seg < nr_segs; seg++) ret += iov[seg].iov_len; return ret; } void iov_iter_advance(struct iov_iter *i, size_t bytes); void iov_iter_revert(struct iov_iter *i, size_t bytes); size_t fault_in_iov_iter_readable(const struct iov_iter *i, size_t bytes); size_t fault_in_iov_iter_writeable(const struct iov_iter *i, size_t bytes); size_t iov_iter_single_seg_count(const struct iov_iter *i); size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes, struct iov_iter *i); size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes, struct iov_iter *i); size_t copy_folio_from_iter_atomic(struct folio *folio, size_t offset, size_t bytes, struct iov_iter *i); size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i); size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i); size_t _copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i); static inline size_t copy_folio_to_iter(struct folio *folio, size_t offset, size_t bytes, struct iov_iter *i) { return copy_page_to_iter(&folio->page, offset, bytes, i); } static inline size_t copy_folio_from_iter(struct folio *folio, size_t offset, size_t bytes, struct iov_iter *i) { return copy_page_from_iter(&folio->page, offset, bytes, i); } size_t copy_page_to_iter_nofault(struct page *page, unsigned offset, size_t bytes, struct iov_iter *i); static __always_inline __must_check size_t copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i) { if (check_copy_size(addr, bytes, true)) return _copy_to_iter(addr, bytes, i); return 0; } static __always_inline __must_check size_t copy_from_iter(void *addr, size_t bytes, struct iov_iter *i) { if (check_copy_size(addr, bytes, false)) return _copy_from_iter(addr, bytes, i); return 0; } static __always_inline __must_check bool copy_to_iter_full(const void *addr, size_t bytes, struct iov_iter *i) { size_t copied = copy_to_iter(addr, bytes, i); if (likely(copied == bytes)) return true; iov_iter_revert(i, copied); return false; } static __always_inline __must_check bool copy_from_iter_full(void *addr, size_t bytes, struct iov_iter *i) { size_t copied = copy_from_iter(addr, bytes, i); if (likely(copied == bytes)) return true; iov_iter_revert(i, copied); return false; } static __always_inline __must_check size_t copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i) { if (check_copy_size(addr, bytes, false)) return _copy_from_iter_nocache(addr, bytes, i); return 0; } static __always_inline __must_check bool copy_from_iter_full_nocache(void *addr, size_t bytes, struct iov_iter *i) { size_t copied = copy_from_iter_nocache(addr, bytes, i); if (likely(copied == bytes)) return true; iov_iter_revert(i, copied); return false; } #ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE /* * Note, users like pmem that depend on the stricter semantics of * _copy_from_iter_flushcache() than _copy_from_iter_nocache() must check for * IS_ENABLED(CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE) before assuming that the * destination is flushed from the cache on return. 
*/ size_t _copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i); #else #define _copy_from_iter_flushcache _copy_from_iter_nocache #endif #ifdef CONFIG_ARCH_HAS_COPY_MC size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i); #else #define _copy_mc_to_iter _copy_to_iter #endif size_t iov_iter_zero(size_t bytes, struct iov_iter *); unsigned long iov_iter_alignment(const struct iov_iter *i); unsigned long iov_iter_gap_alignment(const struct iov_iter *i); void iov_iter_init(struct iov_iter *i, unsigned int direction, const struct iovec *iov, unsigned long nr_segs, size_t count); void iov_iter_kvec(struct iov_iter *i, unsigned int direction, const struct kvec *kvec, unsigned long nr_segs, size_t count); void iov_iter_bvec(struct iov_iter *i, unsigned int direction, const struct bio_vec *bvec, unsigned long nr_segs, size_t count); void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count); void iov_iter_folio_queue(struct iov_iter *i, unsigned int direction, const struct folio_queue *folioq, unsigned int first_slot, unsigned int offset, size_t count); void iov_iter_xarray(struct iov_iter *i, unsigned int direction, struct xarray *xarray, loff_t start, size_t count); ssize_t iov_iter_get_pages2(struct iov_iter *i, struct page **pages, size_t maxsize, unsigned maxpages, size_t *start); ssize_t iov_iter_get_pages_alloc2(struct iov_iter *i, struct page ***pages, size_t maxsize, size_t *start); int iov_iter_npages(const struct iov_iter *i, int maxpages); void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state); const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags); static inline size_t iov_iter_count(const struct iov_iter *i) { return i->count; } /* * Cap the iov_iter by given limit; note that the second argument is * *not* the new size - it's upper limit for such. Passing it a value * greater than the amount of data in iov_iter is fine - it'll just do * nothing in that case. */ static inline void iov_iter_truncate(struct iov_iter *i, u64 count) { /* * count doesn't have to fit in size_t - comparison extends both * operands to u64 here and any value that would be truncated by * conversion in assignement is by definition greater than all * values of size_t, including old i->count. */ if (i->count > count) i->count = count; } /* * reexpand a previously truncated iterator; count must be no more than how much * we had shrunk it. 
*/ static inline void iov_iter_reexpand(struct iov_iter *i, size_t count) { i->count = count; } static inline int iov_iter_npages_cap(struct iov_iter *i, int maxpages, size_t max_bytes) { size_t shorted = 0; int npages; if (iov_iter_count(i) > max_bytes) { shorted = iov_iter_count(i) - max_bytes; iov_iter_truncate(i, max_bytes); } npages = iov_iter_npages(i, maxpages); if (shorted) iov_iter_reexpand(i, iov_iter_count(i) + shorted); return npages; } struct iovec *iovec_from_user(const struct iovec __user *uvector, unsigned long nr_segs, unsigned long fast_segs, struct iovec *fast_iov, bool compat); ssize_t import_iovec(int type, const struct iovec __user *uvec, unsigned nr_segs, unsigned fast_segs, struct iovec **iovp, struct iov_iter *i); ssize_t __import_iovec(int type, const struct iovec __user *uvec, unsigned nr_segs, unsigned fast_segs, struct iovec **iovp, struct iov_iter *i, bool compat); int import_ubuf(int type, void __user *buf, size_t len, struct iov_iter *i); static inline void iov_iter_ubuf(struct iov_iter *i, unsigned int direction, void __user *buf, size_t count) { WARN_ON(direction & ~(READ | WRITE)); *i = (struct iov_iter) { .iter_type = ITER_UBUF, .data_source = direction, .ubuf = buf, .count = count, .nr_segs = 1 }; } /* Flags for iov_iter_get/extract_pages*() */ /* Allow P2PDMA on the extracted pages */ #define ITER_ALLOW_P2PDMA ((__force iov_iter_extraction_t)0x01) ssize_t iov_iter_extract_pages(struct iov_iter *i, struct page ***pages, size_t maxsize, unsigned int maxpages, iov_iter_extraction_t extraction_flags, size_t *offset0); /** * iov_iter_extract_will_pin - Indicate how pages from the iterator will be retained * @iter: The iterator * * Examine the iterator and indicate by returning true or false as to how, if * at all, pages extracted from the iterator will be retained by the extraction * function. * * %true indicates that the pages will have a pin placed in them that the * caller must unpin. This is must be done for DMA/async DIO to force fork() * to forcibly copy a page for the child (the parent must retain the original * page). * * %false indicates that no measures are taken and that it's up to the caller * to retain the pages. */ static inline bool iov_iter_extract_will_pin(const struct iov_iter *iter) { return user_backed_iter(iter); } struct sg_table; ssize_t extract_iter_to_sg(struct iov_iter *iter, size_t len, struct sg_table *sgtable, unsigned int sg_max, iov_iter_extraction_t extraction_flags); #endif
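/*
 * Illustrative sketch (not part of uio.h above): a typical in-kernel use of
 * the iov_iter API declared in this header. The buffers and the demo_copy()
 * wrapper are hypothetical; iov_iter_kvec(), iov_iter_count() and
 * copy_from_iter() are the interfaces shown above. The iterator is set up
 * as a data source (ITER_SOURCE) over one kernel segment, and
 * copy_from_iter() advances it as data is consumed.
 */
#include <linux/uio.h>

static void demo_copy(void)
{
	char src[16] = "hello, iov_iter";
	char dst[16] = { 0 };
	struct kvec kv = { .iov_base = src, .iov_len = sizeof(src) };
	struct iov_iter iter;
	size_t copied;

	iov_iter_kvec(&iter, ITER_SOURCE, &kv, 1, sizeof(src));

	copied = copy_from_iter(dst, sizeof(dst), &iter);

	/* After a full copy the iterator has no bytes left. */
	WARN_ON(copied != sizeof(dst) || iov_iter_count(&iter) != 0);
}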
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_GRE_H
#define __LINUX_GRE_H

#include <linux/skbuff.h>
#include <net/ip_tunnels.h>

struct gre_base_hdr {
	__be16 flags;
	__be16 protocol;
} __packed;

struct gre_full_hdr {
	struct gre_base_hdr fixed_header;
	__be16 csum;
	__be16 reserved1;
	__be32 key;
	__be32 seq;
} __packed;
#define GRE_HEADER_SECTION 4

#define GREPROTO_CISCO		0
#define GREPROTO_PPTP		1
#define GREPROTO_MAX		2
#define GRE_IP_PROTO_MAX	2

struct gre_protocol {
	int  (*handler)(struct sk_buff *skb);
	void (*err_handler)(struct sk_buff *skb, u32 info);
};

int gre_add_protocol(const struct gre_protocol *proto, u8 version);
int gre_del_protocol(const struct gre_protocol *proto, u8 version);

struct net_device *gretap_fb_dev_create(struct net *net, const char *name,
					u8 name_assign_type);
int gre_parse_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
		     bool *csum_err, __be16 proto, int nhs);

static inline bool netif_is_gretap(const struct net_device *dev)
{
	return dev->rtnl_link_ops &&
	       !strcmp(dev->rtnl_link_ops->kind, "gretap");
}

static inline bool netif_is_ip6gretap(const struct net_device *dev)
{
	return dev->rtnl_link_ops &&
	       !strcmp(dev->rtnl_link_ops->kind, "ip6gretap");
}

static inline int gre_calc_hlen(const unsigned long *o_flags)
{
	int addend = 4;

	if (test_bit(IP_TUNNEL_CSUM_BIT, o_flags))
		addend += 4;
	if (test_bit(IP_TUNNEL_KEY_BIT, o_flags))
		addend += 4;
	if (test_bit(IP_TUNNEL_SEQ_BIT, o_flags))
		addend += 4;
	return addend;
}

static inline void gre_flags_to_tnl_flags(unsigned long *dst, __be16 flags)
{
	IP_TUNNEL_DECLARE_FLAGS(res) = { };

	__assign_bit(IP_TUNNEL_CSUM_BIT, res, flags & GRE_CSUM);
	__assign_bit(IP_TUNNEL_ROUTING_BIT, res, flags & GRE_ROUTING);
	__assign_bit(IP_TUNNEL_KEY_BIT, res, flags & GRE_KEY);
	__assign_bit(IP_TUNNEL_SEQ_BIT, res, flags & GRE_SEQ);
	__assign_bit(IP_TUNNEL_STRICT_BIT, res, flags & GRE_STRICT);
	__assign_bit(IP_TUNNEL_REC_BIT, res, flags & GRE_REC);
	__assign_bit(IP_TUNNEL_VERSION_BIT, res, flags & GRE_VERSION);

	ip_tunnel_flags_copy(dst, res);
}

static inline __be16 gre_tnl_flags_to_gre_flags(const unsigned long *tflags)
{
	__be16 flags = 0;

	if (test_bit(IP_TUNNEL_CSUM_BIT, tflags))
		flags |= GRE_CSUM;
	if (test_bit(IP_TUNNEL_ROUTING_BIT, tflags))
		flags |= GRE_ROUTING;
	if (test_bit(IP_TUNNEL_KEY_BIT, tflags))
		flags |= GRE_KEY;
	if (test_bit(IP_TUNNEL_SEQ_BIT, tflags))
		flags |= GRE_SEQ;
	if (test_bit(IP_TUNNEL_STRICT_BIT, tflags))
		flags |= GRE_STRICT;
	if (test_bit(IP_TUNNEL_REC_BIT, tflags))
		flags |= GRE_REC;
	if (test_bit(IP_TUNNEL_VERSION_BIT, tflags))
		flags |= GRE_VERSION;

	return flags;
}

static inline void gre_build_header(struct sk_buff *skb, int hdr_len,
				    const unsigned long *flags, __be16 proto,
				    __be32 key, __be32 seq)
{
	IP_TUNNEL_DECLARE_FLAGS(cond) = { };
	struct gre_base_hdr *greh;

	skb_push(skb, hdr_len);

	skb_set_inner_protocol(skb, proto);
	skb_reset_transport_header(skb);
	greh = (struct gre_base_hdr *)skb->data;
	greh->flags = gre_tnl_flags_to_gre_flags(flags);
	greh->protocol = proto;

	__set_bit(IP_TUNNEL_KEY_BIT, cond);
	__set_bit(IP_TUNNEL_CSUM_BIT, cond);
	__set_bit(IP_TUNNEL_SEQ_BIT, cond);
	if (ip_tunnel_flags_intersect(flags, cond)) {
		__be32 *ptr = (__be32 *)(((u8 *)greh) + hdr_len - 4);

		if (test_bit(IP_TUNNEL_SEQ_BIT, flags)) {
			*ptr = seq;
			ptr--;
		}
		if (test_bit(IP_TUNNEL_KEY_BIT, flags)) {
			*ptr = key;
			ptr--;
		}
		if (test_bit(IP_TUNNEL_CSUM_BIT, flags) &&
		    !(skb_shinfo(skb)->gso_type &
		      (SKB_GSO_GRE | SKB_GSO_GRE_CSUM))) {
			*ptr = 0;
			if (skb->ip_summed == CHECKSUM_PARTIAL) {
				*(__sum16 *)ptr = csum_fold(lco_csum(skb));
			} else {
				skb->ip_summed = CHECKSUM_PARTIAL;
				skb->csum_start = skb_transport_header(skb) -
						  skb->head;
				skb->csum_offset = sizeof(*greh);
			}
		}
	}
}

#endif
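/*
 * Illustrative sketch (not part of gre.h above): how the header-size helper
 * behaves. The demo_gre_hlen() wrapper is hypothetical; gre_calc_hlen() and
 * the IP_TUNNEL_*_BIT tunnel flags are the interfaces defined and used
 * above. The base GRE header is 4 bytes, and each enabled option
 * (checksum + reserved, key, sequence) adds another 4 bytes.
 */
#include <net/gre.h>

static int demo_gre_hlen(void)
{
	IP_TUNNEL_DECLARE_FLAGS(o_flags) = { };

	__set_bit(IP_TUNNEL_KEY_BIT, o_flags);
	__set_bit(IP_TUNNEL_SEQ_BIT, o_flags);

	/* 4 (base) + 4 (key) + 4 (seq) = 12 bytes of GRE header. */
	return gre_calc_hlen(o_flags);
}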
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
 * Copyright (c) 2016 Pablo Neira Ayuso <pablo@netfilter.org>
 *
 * Development of this code funded by Astaro AG (http://www.astaro.com/)
 */

#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/netlink.h>
#include <linux/netfilter.h>
#include <linux/netfilter/nf_tables.h>
#include <net/netfilter/nf_tables_core.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_acct.h>
#include <net/netfilter/nf_conntrack_tuple.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_ecache.h>
#include <net/netfilter/nf_conntrack_labels.h>
#include <net/netfilter/nf_conntrack_timeout.h>
#include <net/netfilter/nf_conntrack_l4proto.h>
#include
<net/netfilter/nf_conntrack_expect.h> struct nft_ct_helper_obj { struct nf_conntrack_helper *helper4; struct nf_conntrack_helper *helper6; u8 l4proto; }; #ifdef CONFIG_NF_CONNTRACK_ZONES static DEFINE_PER_CPU(struct nf_conn *, nft_ct_pcpu_template); static unsigned int nft_ct_pcpu_template_refcnt __read_mostly; static DEFINE_MUTEX(nft_ct_pcpu_mutex); #endif static u64 nft_ct_get_eval_counter(const struct nf_conn_counter *c, enum nft_ct_keys k, enum ip_conntrack_dir d) { if (d < IP_CT_DIR_MAX) return k == NFT_CT_BYTES ? atomic64_read(&c[d].bytes) : atomic64_read(&c[d].packets); return nft_ct_get_eval_counter(c, k, IP_CT_DIR_ORIGINAL) + nft_ct_get_eval_counter(c, k, IP_CT_DIR_REPLY); } static void nft_ct_get_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_ct *priv = nft_expr_priv(expr); u32 *dest = &regs->data[priv->dreg]; enum ip_conntrack_info ctinfo; const struct nf_conn *ct; const struct nf_conn_help *help; const struct nf_conntrack_tuple *tuple; const struct nf_conntrack_helper *helper; unsigned int state; ct = nf_ct_get(pkt->skb, &ctinfo); switch (priv->key) { case NFT_CT_STATE: if (ct) state = NF_CT_STATE_BIT(ctinfo); else if (ctinfo == IP_CT_UNTRACKED) state = NF_CT_STATE_UNTRACKED_BIT; else state = NF_CT_STATE_INVALID_BIT; *dest = state; return; default: break; } if (ct == NULL) goto err; switch (priv->key) { case NFT_CT_DIRECTION: nft_reg_store8(dest, CTINFO2DIR(ctinfo)); return; case NFT_CT_STATUS: *dest = ct->status; return; #ifdef CONFIG_NF_CONNTRACK_MARK case NFT_CT_MARK: *dest = READ_ONCE(ct->mark); return; #endif #ifdef CONFIG_NF_CONNTRACK_SECMARK case NFT_CT_SECMARK: *dest = ct->secmark; return; #endif case NFT_CT_EXPIRATION: *dest = jiffies_to_msecs(nf_ct_expires(ct)); return; case NFT_CT_HELPER: if (ct->master == NULL) goto err; help = nfct_help(ct->master); if (help == NULL) goto err; helper = rcu_dereference(help->helper); if (helper == NULL) goto err; strscpy_pad((char *)dest, helper->name, NF_CT_HELPER_NAME_LEN); return; #ifdef CONFIG_NF_CONNTRACK_LABELS case NFT_CT_LABELS: { struct nf_conn_labels *labels = nf_ct_labels_find(ct); if (labels) memcpy(dest, labels->bits, NF_CT_LABELS_MAX_SIZE); else memset(dest, 0, NF_CT_LABELS_MAX_SIZE); return; } #endif case NFT_CT_BYTES: case NFT_CT_PKTS: { const struct nf_conn_acct *acct = nf_conn_acct_find(ct); u64 count = 0; if (acct) count = nft_ct_get_eval_counter(acct->counter, priv->key, priv->dir); memcpy(dest, &count, sizeof(count)); return; } case NFT_CT_AVGPKT: { const struct nf_conn_acct *acct = nf_conn_acct_find(ct); u64 avgcnt = 0, bcnt = 0, pcnt = 0; if (acct) { pcnt = nft_ct_get_eval_counter(acct->counter, NFT_CT_PKTS, priv->dir); bcnt = nft_ct_get_eval_counter(acct->counter, NFT_CT_BYTES, priv->dir); if (pcnt != 0) avgcnt = div64_u64(bcnt, pcnt); } memcpy(dest, &avgcnt, sizeof(avgcnt)); return; } case NFT_CT_L3PROTOCOL: nft_reg_store8(dest, nf_ct_l3num(ct)); return; case NFT_CT_PROTOCOL: nft_reg_store8(dest, nf_ct_protonum(ct)); return; #ifdef CONFIG_NF_CONNTRACK_ZONES case NFT_CT_ZONE: { const struct nf_conntrack_zone *zone = nf_ct_zone(ct); u16 zoneid; if (priv->dir < IP_CT_DIR_MAX) zoneid = nf_ct_zone_id(zone, priv->dir); else zoneid = zone->id; nft_reg_store16(dest, zoneid); return; } #endif case NFT_CT_ID: *dest = nf_ct_get_id(ct); return; default: break; } tuple = &ct->tuplehash[priv->dir].tuple; switch (priv->key) { case NFT_CT_SRC: memcpy(dest, tuple->src.u3.all, nf_ct_l3num(ct) == NFPROTO_IPV4 ? 
4 : 16); return; case NFT_CT_DST: memcpy(dest, tuple->dst.u3.all, nf_ct_l3num(ct) == NFPROTO_IPV4 ? 4 : 16); return; case NFT_CT_PROTO_SRC: nft_reg_store16(dest, (__force u16)tuple->src.u.all); return; case NFT_CT_PROTO_DST: nft_reg_store16(dest, (__force u16)tuple->dst.u.all); return; case NFT_CT_SRC_IP: if (nf_ct_l3num(ct) != NFPROTO_IPV4) goto err; *dest = (__force __u32)tuple->src.u3.ip; return; case NFT_CT_DST_IP: if (nf_ct_l3num(ct) != NFPROTO_IPV4) goto err; *dest = (__force __u32)tuple->dst.u3.ip; return; case NFT_CT_SRC_IP6: if (nf_ct_l3num(ct) != NFPROTO_IPV6) goto err; memcpy(dest, tuple->src.u3.ip6, sizeof(struct in6_addr)); return; case NFT_CT_DST_IP6: if (nf_ct_l3num(ct) != NFPROTO_IPV6) goto err; memcpy(dest, tuple->dst.u3.ip6, sizeof(struct in6_addr)); return; default: break; } return; err: regs->verdict.code = NFT_BREAK; } #ifdef CONFIG_NF_CONNTRACK_ZONES static void nft_ct_set_zone_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { struct nf_conntrack_zone zone = { .dir = NF_CT_DEFAULT_ZONE_DIR }; const struct nft_ct *priv = nft_expr_priv(expr); struct sk_buff *skb = pkt->skb; enum ip_conntrack_info ctinfo; u16 value = nft_reg_load16(&regs->data[priv->sreg]); struct nf_conn *ct; int oldcnt; ct = nf_ct_get(skb, &ctinfo); if (ct) /* already tracked */ return; zone.id = value; switch (priv->dir) { case IP_CT_DIR_ORIGINAL: zone.dir = NF_CT_ZONE_DIR_ORIG; break; case IP_CT_DIR_REPLY: zone.dir = NF_CT_ZONE_DIR_REPL; break; default: break; } ct = this_cpu_read(nft_ct_pcpu_template); __refcount_inc(&ct->ct_general.use, &oldcnt); if (likely(oldcnt == 1)) { nf_ct_zone_add(ct, &zone); } else { refcount_dec(&ct->ct_general.use); /* previous skb got queued to userspace, allocate temporary * one until percpu template can be reused. 
*/ ct = nf_ct_tmpl_alloc(nft_net(pkt), &zone, GFP_ATOMIC); if (!ct) { regs->verdict.code = NF_DROP; return; } __set_bit(IPS_CONFIRMED_BIT, &ct->status); } nf_ct_set(skb, ct, IP_CT_NEW); } #endif static void nft_ct_set_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_ct *priv = nft_expr_priv(expr); struct sk_buff *skb = pkt->skb; #if defined(CONFIG_NF_CONNTRACK_MARK) || defined(CONFIG_NF_CONNTRACK_SECMARK) u32 value = regs->data[priv->sreg]; #endif enum ip_conntrack_info ctinfo; struct nf_conn *ct; ct = nf_ct_get(skb, &ctinfo); if (ct == NULL || nf_ct_is_template(ct)) return; switch (priv->key) { #ifdef CONFIG_NF_CONNTRACK_MARK case NFT_CT_MARK: if (READ_ONCE(ct->mark) != value) { WRITE_ONCE(ct->mark, value); nf_conntrack_event_cache(IPCT_MARK, ct); } break; #endif #ifdef CONFIG_NF_CONNTRACK_SECMARK case NFT_CT_SECMARK: if (ct->secmark != value) { ct->secmark = value; nf_conntrack_event_cache(IPCT_SECMARK, ct); } break; #endif #ifdef CONFIG_NF_CONNTRACK_LABELS case NFT_CT_LABELS: nf_connlabels_replace(ct, &regs->data[priv->sreg], &regs->data[priv->sreg], NF_CT_LABELS_MAX_SIZE / sizeof(u32)); break; #endif #ifdef CONFIG_NF_CONNTRACK_EVENTS case NFT_CT_EVENTMASK: { struct nf_conntrack_ecache *e = nf_ct_ecache_find(ct); u32 ctmask = regs->data[priv->sreg]; if (e) { if (e->ctmask != ctmask) e->ctmask = ctmask; break; } if (ctmask && !nf_ct_is_confirmed(ct)) nf_ct_ecache_ext_add(ct, ctmask, 0, GFP_ATOMIC); break; } #endif default: break; } } static const struct nla_policy nft_ct_policy[NFTA_CT_MAX + 1] = { [NFTA_CT_DREG] = { .type = NLA_U32 }, [NFTA_CT_KEY] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_CT_DIRECTION] = { .type = NLA_U8 }, [NFTA_CT_SREG] = { .type = NLA_U32 }, }; #ifdef CONFIG_NF_CONNTRACK_ZONES static void nft_ct_tmpl_put_pcpu(void) { struct nf_conn *ct; int cpu; for_each_possible_cpu(cpu) { ct = per_cpu(nft_ct_pcpu_template, cpu); if (!ct) break; nf_ct_put(ct); per_cpu(nft_ct_pcpu_template, cpu) = NULL; } } static bool nft_ct_tmpl_alloc_pcpu(void) { struct nf_conntrack_zone zone = { .id = 0 }; struct nf_conn *tmp; int cpu; if (nft_ct_pcpu_template_refcnt) return true; for_each_possible_cpu(cpu) { tmp = nf_ct_tmpl_alloc(&init_net, &zone, GFP_KERNEL); if (!tmp) { nft_ct_tmpl_put_pcpu(); return false; } __set_bit(IPS_CONFIRMED_BIT, &tmp->status); per_cpu(nft_ct_pcpu_template, cpu) = tmp; } return true; } #endif static int nft_ct_get_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_ct *priv = nft_expr_priv(expr); unsigned int len; int err; priv->key = ntohl(nla_get_be32(tb[NFTA_CT_KEY])); priv->dir = IP_CT_DIR_MAX; switch (priv->key) { case NFT_CT_DIRECTION: if (tb[NFTA_CT_DIRECTION] != NULL) return -EINVAL; len = sizeof(u8); break; case NFT_CT_STATE: case NFT_CT_STATUS: #ifdef CONFIG_NF_CONNTRACK_MARK case NFT_CT_MARK: #endif #ifdef CONFIG_NF_CONNTRACK_SECMARK case NFT_CT_SECMARK: #endif case NFT_CT_EXPIRATION: if (tb[NFTA_CT_DIRECTION] != NULL) return -EINVAL; len = sizeof(u32); break; #ifdef CONFIG_NF_CONNTRACK_LABELS case NFT_CT_LABELS: if (tb[NFTA_CT_DIRECTION] != NULL) return -EINVAL; len = NF_CT_LABELS_MAX_SIZE; break; #endif case NFT_CT_HELPER: if (tb[NFTA_CT_DIRECTION] != NULL) return -EINVAL; len = NF_CT_HELPER_NAME_LEN; break; case NFT_CT_L3PROTOCOL: case NFT_CT_PROTOCOL: /* For compatibility, do not report error if NFTA_CT_DIRECTION * attribute is specified. 
*/ len = sizeof(u8); break; case NFT_CT_SRC: case NFT_CT_DST: if (tb[NFTA_CT_DIRECTION] == NULL) return -EINVAL; switch (ctx->family) { case NFPROTO_IPV4: len = sizeof_field(struct nf_conntrack_tuple, src.u3.ip); break; case NFPROTO_IPV6: case NFPROTO_INET: len = sizeof_field(struct nf_conntrack_tuple, src.u3.ip6); break; default: return -EAFNOSUPPORT; } break; case NFT_CT_SRC_IP: case NFT_CT_DST_IP: if (tb[NFTA_CT_DIRECTION] == NULL) return -EINVAL; len = sizeof_field(struct nf_conntrack_tuple, src.u3.ip); break; case NFT_CT_SRC_IP6: case NFT_CT_DST_IP6: if (tb[NFTA_CT_DIRECTION] == NULL) return -EINVAL; len = sizeof_field(struct nf_conntrack_tuple, src.u3.ip6); break; case NFT_CT_PROTO_SRC: case NFT_CT_PROTO_DST: if (tb[NFTA_CT_DIRECTION] == NULL) return -EINVAL; len = sizeof_field(struct nf_conntrack_tuple, src.u.all); break; case NFT_CT_BYTES: case NFT_CT_PKTS: case NFT_CT_AVGPKT: len = sizeof(u64); break; #ifdef CONFIG_NF_CONNTRACK_ZONES case NFT_CT_ZONE: len = sizeof(u16); break; #endif case NFT_CT_ID: if (tb[NFTA_CT_DIRECTION]) return -EINVAL; len = sizeof(u32); break; default: return -EOPNOTSUPP; } if (tb[NFTA_CT_DIRECTION] != NULL) { priv->dir = nla_get_u8(tb[NFTA_CT_DIRECTION]); switch (priv->dir) { case IP_CT_DIR_ORIGINAL: case IP_CT_DIR_REPLY: break; default: return -EINVAL; } } priv->len = len; err = nft_parse_register_store(ctx, tb[NFTA_CT_DREG], &priv->dreg, NULL, NFT_DATA_VALUE, len); if (err < 0) return err; err = nf_ct_netns_get(ctx->net, ctx->family); if (err < 0) return err; if (priv->key == NFT_CT_BYTES || priv->key == NFT_CT_PKTS || priv->key == NFT_CT_AVGPKT) nf_ct_set_acct(ctx->net, true); return 0; } static void __nft_ct_set_destroy(const struct nft_ctx *ctx, struct nft_ct *priv) { switch (priv->key) { #ifdef CONFIG_NF_CONNTRACK_LABELS case NFT_CT_LABELS: nf_connlabels_put(ctx->net); break; #endif #ifdef CONFIG_NF_CONNTRACK_ZONES case NFT_CT_ZONE: mutex_lock(&nft_ct_pcpu_mutex); if (--nft_ct_pcpu_template_refcnt == 0) nft_ct_tmpl_put_pcpu(); mutex_unlock(&nft_ct_pcpu_mutex); break; #endif default: break; } } static int nft_ct_set_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_ct *priv = nft_expr_priv(expr); unsigned int len; int err; priv->dir = IP_CT_DIR_MAX; priv->key = ntohl(nla_get_be32(tb[NFTA_CT_KEY])); switch (priv->key) { #ifdef CONFIG_NF_CONNTRACK_MARK case NFT_CT_MARK: if (tb[NFTA_CT_DIRECTION]) return -EINVAL; len = sizeof_field(struct nf_conn, mark); break; #endif #ifdef CONFIG_NF_CONNTRACK_LABELS case NFT_CT_LABELS: if (tb[NFTA_CT_DIRECTION]) return -EINVAL; len = NF_CT_LABELS_MAX_SIZE; err = nf_connlabels_get(ctx->net, (len * BITS_PER_BYTE) - 1); if (err) return err; break; #endif #ifdef CONFIG_NF_CONNTRACK_ZONES case NFT_CT_ZONE: mutex_lock(&nft_ct_pcpu_mutex); if (!nft_ct_tmpl_alloc_pcpu()) { mutex_unlock(&nft_ct_pcpu_mutex); return -ENOMEM; } nft_ct_pcpu_template_refcnt++; mutex_unlock(&nft_ct_pcpu_mutex); len = sizeof(u16); break; #endif #ifdef CONFIG_NF_CONNTRACK_EVENTS case NFT_CT_EVENTMASK: if (tb[NFTA_CT_DIRECTION]) return -EINVAL; len = sizeof(u32); break; #endif #ifdef CONFIG_NF_CONNTRACK_SECMARK case NFT_CT_SECMARK: if (tb[NFTA_CT_DIRECTION]) return -EINVAL; len = sizeof(u32); break; #endif default: return -EOPNOTSUPP; } if (tb[NFTA_CT_DIRECTION]) { priv->dir = nla_get_u8(tb[NFTA_CT_DIRECTION]); switch (priv->dir) { case IP_CT_DIR_ORIGINAL: case IP_CT_DIR_REPLY: break; default: err = -EINVAL; goto err1; } } priv->len = len; err = nft_parse_register_load(ctx, tb[NFTA_CT_SREG], 
&priv->sreg, len); if (err < 0) goto err1; err = nf_ct_netns_get(ctx->net, ctx->family); if (err < 0) goto err1; return 0; err1: __nft_ct_set_destroy(ctx, priv); return err; } static void nft_ct_get_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { nf_ct_netns_put(ctx->net, ctx->family); } static void nft_ct_set_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { struct nft_ct *priv = nft_expr_priv(expr); __nft_ct_set_destroy(ctx, priv); nf_ct_netns_put(ctx->net, ctx->family); } static int nft_ct_get_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { const struct nft_ct *priv = nft_expr_priv(expr); if (nft_dump_register(skb, NFTA_CT_DREG, priv->dreg)) goto nla_put_failure; if (nla_put_be32(skb, NFTA_CT_KEY, htonl(priv->key))) goto nla_put_failure; switch (priv->key) { case NFT_CT_SRC: case NFT_CT_DST: case NFT_CT_SRC_IP: case NFT_CT_DST_IP: case NFT_CT_SRC_IP6: case NFT_CT_DST_IP6: case NFT_CT_PROTO_SRC: case NFT_CT_PROTO_DST: if (nla_put_u8(skb, NFTA_CT_DIRECTION, priv->dir)) goto nla_put_failure; break; case NFT_CT_BYTES: case NFT_CT_PKTS: case NFT_CT_AVGPKT: case NFT_CT_ZONE: if (priv->dir < IP_CT_DIR_MAX && nla_put_u8(skb, NFTA_CT_DIRECTION, priv->dir)) goto nla_put_failure; break; default: break; } return 0; nla_put_failure: return -1; } static bool nft_ct_get_reduce(struct nft_regs_track *track, const struct nft_expr *expr) { const struct nft_ct *priv = nft_expr_priv(expr); const struct nft_ct *ct; if (!nft_reg_track_cmp(track, expr, priv->dreg)) { nft_reg_track_update(track, expr, priv->dreg, priv->len); return false; } ct = nft_expr_priv(track->regs[priv->dreg].selector); if (priv->key != ct->key) { nft_reg_track_update(track, expr, priv->dreg, priv->len); return false; } if (!track->regs[priv->dreg].bitwise) return true; return nft_expr_reduce_bitwise(track, expr); } static int nft_ct_set_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { const struct nft_ct *priv = nft_expr_priv(expr); if (nft_dump_register(skb, NFTA_CT_SREG, priv->sreg)) goto nla_put_failure; if (nla_put_be32(skb, NFTA_CT_KEY, htonl(priv->key))) goto nla_put_failure; switch (priv->key) { case NFT_CT_ZONE: if (priv->dir < IP_CT_DIR_MAX && nla_put_u8(skb, NFTA_CT_DIRECTION, priv->dir)) goto nla_put_failure; break; default: break; } return 0; nla_put_failure: return -1; } static struct nft_expr_type nft_ct_type; static const struct nft_expr_ops nft_ct_get_ops = { .type = &nft_ct_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_ct)), .eval = nft_ct_get_eval, .init = nft_ct_get_init, .destroy = nft_ct_get_destroy, .dump = nft_ct_get_dump, .reduce = nft_ct_get_reduce, }; static bool nft_ct_set_reduce(struct nft_regs_track *track, const struct nft_expr *expr) { int i; for (i = 0; i < NFT_REG32_NUM; i++) { if (!track->regs[i].selector) continue; if (track->regs[i].selector->ops != &nft_ct_get_ops) continue; __nft_reg_track_cancel(track, i); } return false; } #ifdef CONFIG_MITIGATION_RETPOLINE static const struct nft_expr_ops nft_ct_get_fast_ops = { .type = &nft_ct_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_ct)), .eval = nft_ct_get_fast_eval, .init = nft_ct_get_init, .destroy = nft_ct_get_destroy, .dump = nft_ct_get_dump, .reduce = nft_ct_set_reduce, }; #endif static const struct nft_expr_ops nft_ct_set_ops = { .type = &nft_ct_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_ct)), .eval = nft_ct_set_eval, .init = nft_ct_set_init, .destroy = nft_ct_set_destroy, .dump = nft_ct_set_dump, .reduce = nft_ct_set_reduce, }; #ifdef CONFIG_NF_CONNTRACK_ZONES static 
const struct nft_expr_ops nft_ct_set_zone_ops = { .type = &nft_ct_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_ct)), .eval = nft_ct_set_zone_eval, .init = nft_ct_set_init, .destroy = nft_ct_set_destroy, .dump = nft_ct_set_dump, .reduce = nft_ct_set_reduce, }; #endif static const struct nft_expr_ops * nft_ct_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { if (tb[NFTA_CT_KEY] == NULL) return ERR_PTR(-EINVAL); if (tb[NFTA_CT_DREG] && tb[NFTA_CT_SREG]) return ERR_PTR(-EINVAL); if (tb[NFTA_CT_DREG]) { #ifdef CONFIG_MITIGATION_RETPOLINE u32 k = ntohl(nla_get_be32(tb[NFTA_CT_KEY])); switch (k) { case NFT_CT_STATE: case NFT_CT_DIRECTION: case NFT_CT_STATUS: case NFT_CT_MARK: case NFT_CT_SECMARK: return &nft_ct_get_fast_ops; } #endif return &nft_ct_get_ops; } if (tb[NFTA_CT_SREG]) { #ifdef CONFIG_NF_CONNTRACK_ZONES if (nla_get_be32(tb[NFTA_CT_KEY]) == htonl(NFT_CT_ZONE)) return &nft_ct_set_zone_ops; #endif return &nft_ct_set_ops; } return ERR_PTR(-EINVAL); } static struct nft_expr_type nft_ct_type __read_mostly = { .name = "ct", .select_ops = nft_ct_select_ops, .policy = nft_ct_policy, .maxattr = NFTA_CT_MAX, .owner = THIS_MODULE, }; static void nft_notrack_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { struct sk_buff *skb = pkt->skb; enum ip_conntrack_info ctinfo; struct nf_conn *ct; ct = nf_ct_get(pkt->skb, &ctinfo); /* Previously seen (loopback or untracked)? Ignore. */ if (ct || ctinfo == IP_CT_UNTRACKED) return; nf_ct_set(skb, ct, IP_CT_UNTRACKED); } static struct nft_expr_type nft_notrack_type; static const struct nft_expr_ops nft_notrack_ops = { .type = &nft_notrack_type, .size = NFT_EXPR_SIZE(0), .eval = nft_notrack_eval, .reduce = NFT_REDUCE_READONLY, }; static struct nft_expr_type nft_notrack_type __read_mostly = { .name = "notrack", .ops = &nft_notrack_ops, .owner = THIS_MODULE, }; #ifdef CONFIG_NF_CONNTRACK_TIMEOUT static int nft_ct_timeout_parse_policy(void *timeouts, const struct nf_conntrack_l4proto *l4proto, struct net *net, const struct nlattr *attr) { struct nlattr **tb; int ret = 0; tb = kcalloc(l4proto->ctnl_timeout.nlattr_max + 1, sizeof(*tb), GFP_KERNEL); if (!tb) return -ENOMEM; ret = nla_parse_nested_deprecated(tb, l4proto->ctnl_timeout.nlattr_max, attr, l4proto->ctnl_timeout.nla_policy, NULL); if (ret < 0) goto err; ret = l4proto->ctnl_timeout.nlattr_to_obj(tb, net, timeouts); err: kfree(tb); return ret; } struct nft_ct_timeout_obj { struct nf_ct_timeout *timeout; u8 l4proto; }; static void nft_ct_timeout_obj_eval(struct nft_object *obj, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_ct_timeout_obj *priv = nft_obj_data(obj); struct nf_conn *ct = (struct nf_conn *)skb_nfct(pkt->skb); struct nf_conn_timeout *timeout; const unsigned int *values; if (priv->l4proto != pkt->tprot) return; if (!ct || nf_ct_is_template(ct) || nf_ct_is_confirmed(ct)) return; timeout = nf_ct_timeout_find(ct); if (!timeout) { timeout = nf_ct_timeout_ext_add(ct, priv->timeout, GFP_ATOMIC); if (!timeout) { regs->verdict.code = NF_DROP; return; } } rcu_assign_pointer(timeout->timeout, priv->timeout); /* adjust the timeout as per 'new' state. ct is unconfirmed, * so the current timestamp must not be added. 
*/ values = nf_ct_timeout_data(timeout); if (values) nf_ct_refresh(ct, values[0]); } static int nft_ct_timeout_obj_init(const struct nft_ctx *ctx, const struct nlattr * const tb[], struct nft_object *obj) { struct nft_ct_timeout_obj *priv = nft_obj_data(obj); const struct nf_conntrack_l4proto *l4proto; struct nf_ct_timeout *timeout; int l3num = ctx->family; __u8 l4num; int ret; if (!tb[NFTA_CT_TIMEOUT_L4PROTO] || !tb[NFTA_CT_TIMEOUT_DATA]) return -EINVAL; if (tb[NFTA_CT_TIMEOUT_L3PROTO]) l3num = ntohs(nla_get_be16(tb[NFTA_CT_TIMEOUT_L3PROTO])); l4num = nla_get_u8(tb[NFTA_CT_TIMEOUT_L4PROTO]); priv->l4proto = l4num; l4proto = nf_ct_l4proto_find(l4num); if (l4proto->l4proto != l4num) { ret = -EOPNOTSUPP; goto err_proto_put; } timeout = kzalloc(sizeof(struct nf_ct_timeout) + l4proto->ctnl_timeout.obj_size, GFP_KERNEL); if (timeout == NULL) { ret = -ENOMEM; goto err_proto_put; } ret = nft_ct_timeout_parse_policy(&timeout->data, l4proto, ctx->net, tb[NFTA_CT_TIMEOUT_DATA]); if (ret < 0) goto err_free_timeout; timeout->l3num = l3num; timeout->l4proto = l4proto; ret = nf_ct_netns_get(ctx->net, ctx->family); if (ret < 0) goto err_free_timeout; priv->timeout = timeout; return 0; err_free_timeout: kfree(timeout); err_proto_put: return ret; } static void nft_ct_timeout_obj_destroy(const struct nft_ctx *ctx, struct nft_object *obj) { struct nft_ct_timeout_obj *priv = nft_obj_data(obj); struct nf_ct_timeout *timeout = priv->timeout; nf_ct_untimeout(ctx->net, timeout); nf_ct_netns_put(ctx->net, ctx->family); kfree(priv->timeout); } static int nft_ct_timeout_obj_dump(struct sk_buff *skb, struct nft_object *obj, bool reset) { const struct nft_ct_timeout_obj *priv = nft_obj_data(obj); const struct nf_ct_timeout *timeout = priv->timeout; struct nlattr *nest_params; int ret; if (nla_put_u8(skb, NFTA_CT_TIMEOUT_L4PROTO, timeout->l4proto->l4proto) || nla_put_be16(skb, NFTA_CT_TIMEOUT_L3PROTO, htons(timeout->l3num))) return -1; nest_params = nla_nest_start(skb, NFTA_CT_TIMEOUT_DATA); if (!nest_params) return -1; ret = timeout->l4proto->ctnl_timeout.obj_to_nlattr(skb, &timeout->data); if (ret < 0) return -1; nla_nest_end(skb, nest_params); return 0; } static const struct nla_policy nft_ct_timeout_policy[NFTA_CT_TIMEOUT_MAX + 1] = { [NFTA_CT_TIMEOUT_L3PROTO] = {.type = NLA_U16 }, [NFTA_CT_TIMEOUT_L4PROTO] = {.type = NLA_U8 }, [NFTA_CT_TIMEOUT_DATA] = {.type = NLA_NESTED }, }; static struct nft_object_type nft_ct_timeout_obj_type; static const struct nft_object_ops nft_ct_timeout_obj_ops = { .type = &nft_ct_timeout_obj_type, .size = sizeof(struct nft_ct_timeout_obj), .eval = nft_ct_timeout_obj_eval, .init = nft_ct_timeout_obj_init, .destroy = nft_ct_timeout_obj_destroy, .dump = nft_ct_timeout_obj_dump, }; static struct nft_object_type nft_ct_timeout_obj_type __read_mostly = { .type = NFT_OBJECT_CT_TIMEOUT, .ops = &nft_ct_timeout_obj_ops, .maxattr = NFTA_CT_TIMEOUT_MAX, .policy = nft_ct_timeout_policy, .owner = THIS_MODULE, }; #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ static int nft_ct_helper_obj_init(const struct nft_ctx *ctx, const struct nlattr * const tb[], struct nft_object *obj) { struct nft_ct_helper_obj *priv = nft_obj_data(obj); struct nf_conntrack_helper *help4, *help6; char name[NF_CT_HELPER_NAME_LEN]; int family = ctx->family; int err; if (!tb[NFTA_CT_HELPER_NAME] || !tb[NFTA_CT_HELPER_L4PROTO]) return -EINVAL; priv->l4proto = nla_get_u8(tb[NFTA_CT_HELPER_L4PROTO]); if (!priv->l4proto) return -ENOENT; nla_strscpy(name, tb[NFTA_CT_HELPER_NAME], sizeof(name)); if (tb[NFTA_CT_HELPER_L3PROTO]) family = 
ntohs(nla_get_be16(tb[NFTA_CT_HELPER_L3PROTO])); help4 = NULL; help6 = NULL; switch (family) { case NFPROTO_IPV4: if (ctx->family == NFPROTO_IPV6) return -EINVAL; help4 = nf_conntrack_helper_try_module_get(name, family, priv->l4proto); break; case NFPROTO_IPV6: if (ctx->family == NFPROTO_IPV4) return -EINVAL; help6 = nf_conntrack_helper_try_module_get(name, family, priv->l4proto); break; case NFPROTO_NETDEV: case NFPROTO_BRIDGE: case NFPROTO_INET: help4 = nf_conntrack_helper_try_module_get(name, NFPROTO_IPV4, priv->l4proto); help6 = nf_conntrack_helper_try_module_get(name, NFPROTO_IPV6, priv->l4proto); break; default: return -EAFNOSUPPORT; } /* && is intentional; only error if INET found neither ipv4 or ipv6 */ if (!help4 && !help6) return -ENOENT; priv->helper4 = help4; priv->helper6 = help6; err = nf_ct_netns_get(ctx->net, ctx->family); if (err < 0) goto err_put_helper; return 0; err_put_helper: if (priv->helper4) nf_conntrack_helper_put(priv->helper4); if (priv->helper6) nf_conntrack_helper_put(priv->helper6); return err; } static void nft_ct_helper_obj_destroy(const struct nft_ctx *ctx, struct nft_object *obj) { struct nft_ct_helper_obj *priv = nft_obj_data(obj); if (priv->helper4) nf_conntrack_helper_put(priv->helper4); if (priv->helper6) nf_conntrack_helper_put(priv->helper6); nf_ct_netns_put(ctx->net, ctx->family); } static void nft_ct_helper_obj_eval(struct nft_object *obj, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_ct_helper_obj *priv = nft_obj_data(obj); struct nf_conn *ct = (struct nf_conn *)skb_nfct(pkt->skb); struct nf_conntrack_helper *to_assign = NULL; struct nf_conn_help *help; if (!ct || nf_ct_is_confirmed(ct) || nf_ct_is_template(ct) || priv->l4proto != nf_ct_protonum(ct)) return; switch (nf_ct_l3num(ct)) { case NFPROTO_IPV4: to_assign = priv->helper4; break; case NFPROTO_IPV6: to_assign = priv->helper6; break; default: WARN_ON_ONCE(1); return; } if (!to_assign) return; if (test_bit(IPS_HELPER_BIT, &ct->status)) return; help = nf_ct_helper_ext_add(ct, GFP_ATOMIC); if (help) { rcu_assign_pointer(help->helper, to_assign); set_bit(IPS_HELPER_BIT, &ct->status); } } static int nft_ct_helper_obj_dump(struct sk_buff *skb, struct nft_object *obj, bool reset) { const struct nft_ct_helper_obj *priv = nft_obj_data(obj); const struct nf_conntrack_helper *helper; u16 family; if (priv->helper4 && priv->helper6) { family = NFPROTO_INET; helper = priv->helper4; } else if (priv->helper6) { family = NFPROTO_IPV6; helper = priv->helper6; } else { family = NFPROTO_IPV4; helper = priv->helper4; } if (nla_put_string(skb, NFTA_CT_HELPER_NAME, helper->name)) return -1; if (nla_put_u8(skb, NFTA_CT_HELPER_L4PROTO, priv->l4proto)) return -1; if (nla_put_be16(skb, NFTA_CT_HELPER_L3PROTO, htons(family))) return -1; return 0; } static const struct nla_policy nft_ct_helper_policy[NFTA_CT_HELPER_MAX + 1] = { [NFTA_CT_HELPER_NAME] = { .type = NLA_STRING, .len = NF_CT_HELPER_NAME_LEN - 1 }, [NFTA_CT_HELPER_L3PROTO] = { .type = NLA_U16 }, [NFTA_CT_HELPER_L4PROTO] = { .type = NLA_U8 }, }; static struct nft_object_type nft_ct_helper_obj_type; static const struct nft_object_ops nft_ct_helper_obj_ops = { .type = &nft_ct_helper_obj_type, .size = sizeof(struct nft_ct_helper_obj), .eval = nft_ct_helper_obj_eval, .init = nft_ct_helper_obj_init, .destroy = nft_ct_helper_obj_destroy, .dump = nft_ct_helper_obj_dump, }; static struct nft_object_type nft_ct_helper_obj_type __read_mostly = { .type = NFT_OBJECT_CT_HELPER, .ops = &nft_ct_helper_obj_ops, .maxattr = NFTA_CT_HELPER_MAX, .policy 
= nft_ct_helper_policy, .owner = THIS_MODULE, }; struct nft_ct_expect_obj { u16 l3num; __be16 dport; u8 l4proto; u8 size; u32 timeout; }; static int nft_ct_expect_obj_init(const struct nft_ctx *ctx, const struct nlattr * const tb[], struct nft_object *obj) { struct nft_ct_expect_obj *priv = nft_obj_data(obj); if (!tb[NFTA_CT_EXPECT_L4PROTO] || !tb[NFTA_CT_EXPECT_DPORT] || !tb[NFTA_CT_EXPECT_TIMEOUT] || !tb[NFTA_CT_EXPECT_SIZE]) return -EINVAL; priv->l3num = ctx->family; if (tb[NFTA_CT_EXPECT_L3PROTO]) priv->l3num = ntohs(nla_get_be16(tb[NFTA_CT_EXPECT_L3PROTO])); switch (priv->l3num) { case NFPROTO_IPV4: case NFPROTO_IPV6: if (priv->l3num == ctx->family || ctx->family == NFPROTO_INET) break; return -EINVAL; case NFPROTO_INET: /* tuple.src.l3num supports NFPROTO_IPV4/6 only */ default: return -EAFNOSUPPORT; } priv->l4proto = nla_get_u8(tb[NFTA_CT_EXPECT_L4PROTO]); switch (priv->l4proto) { case IPPROTO_TCP: case IPPROTO_UDP: case IPPROTO_UDPLITE: case IPPROTO_DCCP: case IPPROTO_SCTP: break; default: return -EOPNOTSUPP; } priv->dport = nla_get_be16(tb[NFTA_CT_EXPECT_DPORT]); priv->timeout = nla_get_u32(tb[NFTA_CT_EXPECT_TIMEOUT]); priv->size = nla_get_u8(tb[NFTA_CT_EXPECT_SIZE]); return nf_ct_netns_get(ctx->net, ctx->family); } static void nft_ct_expect_obj_destroy(const struct nft_ctx *ctx, struct nft_object *obj) { nf_ct_netns_put(ctx->net, ctx->family); } static int nft_ct_expect_obj_dump(struct sk_buff *skb, struct nft_object *obj, bool reset) { const struct nft_ct_expect_obj *priv = nft_obj_data(obj); if (nla_put_be16(skb, NFTA_CT_EXPECT_L3PROTO, htons(priv->l3num)) || nla_put_u8(skb, NFTA_CT_EXPECT_L4PROTO, priv->l4proto) || nla_put_be16(skb, NFTA_CT_EXPECT_DPORT, priv->dport) || nla_put_u32(skb, NFTA_CT_EXPECT_TIMEOUT, priv->timeout) || nla_put_u8(skb, NFTA_CT_EXPECT_SIZE, priv->size)) return -1; return 0; } static void nft_ct_expect_obj_eval(struct nft_object *obj, struct nft_regs *regs, const struct nft_pktinfo *pkt) { const struct nft_ct_expect_obj *priv = nft_obj_data(obj); struct nf_conntrack_expect *exp; enum ip_conntrack_info ctinfo; struct nf_conn_help *help; enum ip_conntrack_dir dir; u16 l3num = priv->l3num; struct nf_conn *ct; ct = nf_ct_get(pkt->skb, &ctinfo); if (!ct || nf_ct_is_confirmed(ct) || nf_ct_is_template(ct)) { regs->verdict.code = NFT_BREAK; return; } dir = CTINFO2DIR(ctinfo); help = nfct_help(ct); if (!help) help = nf_ct_helper_ext_add(ct, GFP_ATOMIC); if (!help) { regs->verdict.code = NF_DROP; return; } if (help->expecting[NF_CT_EXPECT_CLASS_DEFAULT] >= priv->size) { regs->verdict.code = NFT_BREAK; return; } if (l3num == NFPROTO_INET) l3num = nf_ct_l3num(ct); exp = nf_ct_expect_alloc(ct); if (exp == NULL) { regs->verdict.code = NF_DROP; return; } nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, l3num, &ct->tuplehash[!dir].tuple.src.u3, &ct->tuplehash[!dir].tuple.dst.u3, priv->l4proto, NULL, &priv->dport); exp->timeout.expires = jiffies + priv->timeout * HZ; if (nf_ct_expect_related(exp, 0) != 0) regs->verdict.code = NF_DROP; } static const struct nla_policy nft_ct_expect_policy[NFTA_CT_EXPECT_MAX + 1] = { [NFTA_CT_EXPECT_L3PROTO] = { .type = NLA_U16 }, [NFTA_CT_EXPECT_L4PROTO] = { .type = NLA_U8 }, [NFTA_CT_EXPECT_DPORT] = { .type = NLA_U16 }, [NFTA_CT_EXPECT_TIMEOUT] = { .type = NLA_U32 }, [NFTA_CT_EXPECT_SIZE] = { .type = NLA_U8 }, }; static struct nft_object_type nft_ct_expect_obj_type; static const struct nft_object_ops nft_ct_expect_obj_ops = { .type = &nft_ct_expect_obj_type, .size = sizeof(struct nft_ct_expect_obj), .eval = nft_ct_expect_obj_eval, 
.init = nft_ct_expect_obj_init, .destroy = nft_ct_expect_obj_destroy, .dump = nft_ct_expect_obj_dump, }; static struct nft_object_type nft_ct_expect_obj_type __read_mostly = { .type = NFT_OBJECT_CT_EXPECT, .ops = &nft_ct_expect_obj_ops, .maxattr = NFTA_CT_EXPECT_MAX, .policy = nft_ct_expect_policy, .owner = THIS_MODULE, }; static int __init nft_ct_module_init(void) { int err; BUILD_BUG_ON(NF_CT_LABELS_MAX_SIZE > NFT_REG_SIZE); err = nft_register_expr(&nft_ct_type); if (err < 0) return err; err = nft_register_expr(&nft_notrack_type); if (err < 0) goto err1; err = nft_register_obj(&nft_ct_helper_obj_type); if (err < 0) goto err2; err = nft_register_obj(&nft_ct_expect_obj_type); if (err < 0) goto err3; #ifdef CONFIG_NF_CONNTRACK_TIMEOUT err = nft_register_obj(&nft_ct_timeout_obj_type); if (err < 0) goto err4; #endif return 0; #ifdef CONFIG_NF_CONNTRACK_TIMEOUT err4: nft_unregister_obj(&nft_ct_expect_obj_type); #endif err3: nft_unregister_obj(&nft_ct_helper_obj_type); err2: nft_unregister_expr(&nft_notrack_type); err1: nft_unregister_expr(&nft_ct_type); return err; } static void __exit nft_ct_module_exit(void) { #ifdef CONFIG_NF_CONNTRACK_TIMEOUT nft_unregister_obj(&nft_ct_timeout_obj_type); #endif nft_unregister_obj(&nft_ct_expect_obj_type); nft_unregister_obj(&nft_ct_helper_obj_type); nft_unregister_expr(&nft_notrack_type); nft_unregister_expr(&nft_ct_type); } module_init(nft_ct_module_init); module_exit(nft_ct_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); MODULE_ALIAS_NFT_EXPR("ct"); MODULE_ALIAS_NFT_EXPR("notrack"); MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_CT_HELPER); MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_CT_TIMEOUT); MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_CT_EXPECT); MODULE_DESCRIPTION("Netfilter nf_tables conntrack module");
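/*
 * Illustrative sketch (plain userspace C, not part of nft_ct.c above): a
 * model of the counter handling in nft_ct_get_eval_counter() and the
 * NFT_CT_AVGPKT case. A specific direction returns that direction's
 * counter; the "no direction" value (IP_CT_DIR_MAX in the kernel code)
 * returns the sum of both directions, and the average packet size is
 * bytes / packets. All names below are local to this sketch.
 */
#include <stdint.h>
#include <stdio.h>

enum dir { DIR_ORIGINAL, DIR_REPLY, DIR_MAX };

struct acct { uint64_t packets, bytes; };

static uint64_t ct_bytes(const struct acct *c, enum dir d)
{
	if (d < DIR_MAX)
		return c[d].bytes;
	return c[DIR_ORIGINAL].bytes + c[DIR_REPLY].bytes;
}

static uint64_t ct_packets(const struct acct *c, enum dir d)
{
	if (d < DIR_MAX)
		return c[d].packets;
	return c[DIR_ORIGINAL].packets + c[DIR_REPLY].packets;
}

int main(void)
{
	struct acct c[2] = { { 10, 14000 }, { 8, 1200 } };
	uint64_t pkts = ct_packets(c, DIR_MAX);
	uint64_t bytes = ct_bytes(c, DIR_MAX);

	/* avgpkt is 0 when no packets were accounted, as in the kernel code. */
	printf("avgpkt = %llu\n",
	       (unsigned long long)(pkts ? bytes / pkts : 0));
	return 0;
}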
// SPDX-License-Identifier: GPL-2.0

#include <kunit/visibility.h>

#include <linux/kernel.h>
#include <linux/irqflags.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/bug.h>
#include "printk_ringbuffer.h"
#include "internal.h"

/**
 * DOC: printk_ringbuffer overview
 *
 * Data Structure
 * --------------
 * The printk_ringbuffer is made up of 2 internal ringbuffers:
 *
 * desc_ring
 *	A ring of descriptors and their meta data (such as sequence number,
 *	timestamp, loglevel, etc.) as well as internal state information about
 *	the record and logical positions specifying where in the other
 *	ringbuffer the text strings are located.
 *
 * text_data_ring
 *	A ring of data blocks. A data block consists of an unsigned long
 *	integer (ID) that maps to a desc_ring index followed by the text
 *	string of the record.
 *
 * The internal state information of a descriptor is the key element to allow
 * readers and writers to locklessly synchronize access to the data.
 *
 * Implementation
 * --------------
 *
 * Descriptor Ring
 * ~~~~~~~~~~~~~~~
 * The descriptor ring is an array of descriptors. A descriptor contains
 * essential meta data to track the data of a printk record using
 * blk_lpos structs pointing to associated text data blocks (see
 * "Data Rings" below). Each descriptor is assigned an ID that maps
 * directly to index values of the descriptor array and has a state. The ID
 * and the state are bitwise combined into a single descriptor field named
 * @state_var, allowing ID and state to be synchronously and atomically
 * updated.
 *
 * Descriptors have four states:
 *
 * reserved
 *	A writer is modifying the record.
 *
 * committed
 *	The record and all its data are written. A writer can reopen the
 *	descriptor (transitioning it back to reserved), but in the committed
 *	state the data is consistent.
 *
 * finalized
 *	The record and all its data are complete and available for reading. A
 *	writer cannot reopen the descriptor.
 *
 * reusable
 *	The record exists, but its text and/or meta data may no longer be
 *	available.
 *
 * Querying the @state_var of a record requires providing the ID of the
 * descriptor to query. This can yield a possible fifth (pseudo) state:
 *
 * miss
 *	The descriptor being queried has an unexpected ID.
 *
 * The descriptor ring has a @tail_id that contains the ID of the oldest
 * descriptor and @head_id that contains the ID of the newest descriptor.
 *
 * When a new descriptor should be created (and the ring is full), the tail
 * descriptor is invalidated by first transitioning to the reusable state and
 * then invalidating all tail data blocks up to and including the data blocks
 * associated with the tail descriptor (for the text ring). Then
 * @tail_id is advanced, followed by advancing @head_id. And finally the
 * @state_var of the new descriptor is initialized to the new ID and reserved
 * state.
 *
 * The @tail_id can only be advanced if the new @tail_id would be in the
 * committed or reusable queried state. This makes it possible that a valid
 * sequence number of the tail is always available.
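 *
 * For example, the combined ID/state query can be sketched with the
 * DESC_ID() and DESC_STATE() helpers from printk_ringbuffer.h (a simplified
 * illustration of the idea only; the in-tree version is get_desc_state()
 * further below)::
 *
 *	// @id:        the descriptor ID the caller expects to find
 *	// @state_val: a snapshot of that descriptor's @state_var value
 *	enum desc_state query_state(unsigned long id, unsigned long state_val)
 *	{
 *		// a different ID means the slot was recycled: pseudo state "miss"
 *		if (id != DESC_ID(state_val))
 *			return desc_miss;
 *
 *		// otherwise the embedded state bits apply to this ID
 *		return DESC_STATE(state_val);
 *	}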
* * Descriptor Finalization * ~~~~~~~~~~~~~~~~~~~~~~~ * When a writer calls the commit function prb_commit(), record data is * fully stored and is consistent within the ringbuffer. However, a writer can * reopen that record, claiming exclusive access (as with prb_reserve()), and * modify that record. When finished, the writer must again commit the record. * * In order for a record to be made available to readers (and also become * recyclable for writers), it must be finalized. A finalized record cannot be * reopened and can never become "unfinalized". Record finalization can occur * in three different scenarios: * * 1) A writer can simultaneously commit and finalize its record by calling * prb_final_commit() instead of prb_commit(). * * 2) When a new record is reserved and the previous record has been * committed via prb_commit(), that previous record is automatically * finalized. * * 3) When a record is committed via prb_commit() and a newer record * already exists, the record being committed is automatically finalized. * * Data Ring * ~~~~~~~~~ * The text data ring is a byte array composed of data blocks. Data blocks are * referenced by blk_lpos structs that point to the logical position of the * beginning of a data block and the beginning of the next adjacent data * block. Logical positions are mapped directly to index values of the byte * array ringbuffer. * * Each data block consists of an ID followed by the writer data. The ID is * the identifier of a descriptor that is associated with the data block. A * given data block is considered valid if all of the following conditions * are met: * * 1) The descriptor associated with the data block is in the committed * or finalized queried state. * * 2) The blk_lpos struct within the descriptor associated with the data * block references back to the same data block. * * 3) The data block is within the head/tail logical position range. * * If the writer data of a data block would extend beyond the end of the * byte array, only the ID of the data block is stored at the logical * position and the full data block (ID and writer data) is stored at the * beginning of the byte array. The referencing blk_lpos will point to the * ID before the wrap and the next data block will be at the logical * position adjacent the full data block after the wrap. * * Data rings have a @tail_lpos that points to the beginning of the oldest * data block and a @head_lpos that points to the logical position of the * next (not yet existing) data block. * * When a new data block should be created (and the ring is full), tail data * blocks will first be invalidated by putting their associated descriptors * into the reusable state and then pushing the @tail_lpos forward beyond * them. Then the @head_lpos is pushed forward and is associated with a new * descriptor. If a data block is not valid, the @tail_lpos cannot be * advanced beyond it. * * Info Array * ~~~~~~~~~~ * The general meta data of printk records are stored in printk_info structs, * stored in an array with the same number of elements as the descriptor ring. * Each info corresponds to the descriptor of the same index in the * descriptor ring. Info validity is confirmed by evaluating the corresponding * descriptor before and after loading the info. * * Usage * ----- * Here are some simple examples demonstrating writers and readers. 
For the * examples a global ringbuffer (test_rb) is available (which is not the * actual ringbuffer used by printk):: * * DEFINE_PRINTKRB(test_rb, 15, 5); * * This ringbuffer allows up to 32768 records (2 ^ 15) and has a size of * 1 MiB (2 ^ (15 + 5)) for text data. * * Sample writer code:: * * const char *textstr = "message text"; * struct prb_reserved_entry e; * struct printk_record r; * * // specify how much to allocate * prb_rec_init_wr(&r, strlen(textstr) + 1); * * if (prb_reserve(&e, &test_rb, &r)) { * snprintf(r.text_buf, r.text_buf_size, "%s", textstr); * * r.info->text_len = strlen(textstr); * r.info->ts_nsec = local_clock(); * r.info->caller_id = printk_caller_id(); * * // commit and finalize the record * prb_final_commit(&e); * } * * Note that additional writer functions are available to extend a record * after it has been committed but not yet finalized. This can be done as * long as no new records have been reserved and the caller is the same. * * Sample writer code (record extending):: * * // alternate rest of previous example * * r.info->text_len = strlen(textstr); * r.info->ts_nsec = local_clock(); * r.info->caller_id = printk_caller_id(); * * // commit the record (but do not finalize yet) * prb_commit(&e); * } * * ... * * // specify additional 5 bytes text space to extend * prb_rec_init_wr(&r, 5); * * // try to extend, but only if it does not exceed 32 bytes * if (prb_reserve_in_last(&e, &test_rb, &r, printk_caller_id(), 32)) { * snprintf(&r.text_buf[r.info->text_len], * r.text_buf_size - r.info->text_len, "hello"); * * r.info->text_len += 5; * * // commit and finalize the record * prb_final_commit(&e); * } * * Sample reader code:: * * struct printk_info info; * struct printk_record r; * char text_buf[32]; * u64 seq; * * prb_rec_init_rd(&r, &info, &text_buf[0], sizeof(text_buf)); * * prb_for_each_record(0, &test_rb, &seq, &r) { * if (info.seq != seq) * pr_warn("lost %llu records\n", info.seq - seq); * * if (info.text_len > r.text_buf_size) { * pr_warn("record %llu text truncated\n", info.seq); * text_buf[r.text_buf_size - 1] = 0; * } * * pr_info("%llu: %llu: %s\n", info.seq, info.ts_nsec, * &text_buf[0]); * } * * Note that additional less convenient reader functions are available to * allow complex record access. * * ABA Issues * ~~~~~~~~~~ * To help avoid ABA issues, descriptors are referenced by IDs (array index * values combined with tagged bits counting array wraps) and data blocks are * referenced by logical positions (array index values combined with tagged * bits counting array wraps). However, on 32-bit systems the number of * tagged bits is relatively small such that an ABA incident is (at least * theoretically) possible. For example, if 4 million maximally sized (1KiB) * printk messages were to occur in NMI context on a 32-bit system, the * interrupted context would not be able to recognize that the 32-bit integer * completely wrapped and thus represents a different data block than the one * the interrupted context expects. * * To help combat this possibility, additional state checking is performed * (such as using cmpxchg() even though set() would suffice). These extra * checks are commented as such and will hopefully catch any ABA issue that * a 32-bit system might experience. * * Memory Barriers * ~~~~~~~~~~~~~~~ * Multiple memory barriers are used. 
To simplify proving correctness and * generating litmus tests, lines of code related to memory barriers * (loads, stores, and the associated memory barriers) are labeled:: * * LMM(function:letter) * * Comments reference the labels using only the "function:letter" part. * * The memory barrier pairs and their ordering are: * * desc_reserve:D / desc_reserve:B * push descriptor tail (id), then push descriptor head (id) * * desc_reserve:D / data_push_tail:B * push data tail (lpos), then set new descriptor reserved (state) * * desc_reserve:D / desc_push_tail:C * push descriptor tail (id), then set new descriptor reserved (state) * * desc_reserve:D / prb_first_seq:C * push descriptor tail (id), then set new descriptor reserved (state) * * desc_reserve:F / desc_read:D * set new descriptor id and reserved (state), then allow writer changes * * data_alloc:A (or data_realloc:A) / desc_read:D * set old descriptor reusable (state), then modify new data block area * * data_alloc:A (or data_realloc:A) / data_push_tail:B * push data tail (lpos), then modify new data block area * * _prb_commit:B / desc_read:B * store writer changes, then set new descriptor committed (state) * * desc_reopen_last:A / _prb_commit:B * set descriptor reserved (state), then read descriptor data * * _prb_commit:B / desc_reserve:D * set new descriptor committed (state), then check descriptor head (id) * * data_push_tail:D / data_push_tail:A * set descriptor reusable (state), then push data tail (lpos) * * desc_push_tail:B / desc_reserve:D * set descriptor reusable (state), then push descriptor tail (id) * * desc_update_last_finalized:A / desc_last_finalized_seq:A * store finalized record, then set new highest finalized sequence number */ #define DATA_SIZE(data_ring) _DATA_SIZE((data_ring)->size_bits) #define DATA_SIZE_MASK(data_ring) (DATA_SIZE(data_ring) - 1) #define DESCS_COUNT(desc_ring) _DESCS_COUNT((desc_ring)->count_bits) #define DESCS_COUNT_MASK(desc_ring) (DESCS_COUNT(desc_ring) - 1) /* Determine the data array index from a logical position. */ #define DATA_INDEX(data_ring, lpos) ((lpos) & DATA_SIZE_MASK(data_ring)) /* Determine the desc array index from an ID or sequence number. */ #define DESC_INDEX(desc_ring, n) ((n) & DESCS_COUNT_MASK(desc_ring)) /* Determine how many times the data array has wrapped. */ #define DATA_WRAPS(data_ring, lpos) ((lpos) >> (data_ring)->size_bits) /* Determine if a logical position refers to a data-less block. */ #define LPOS_DATALESS(lpos) ((lpos) & 1UL) #define BLK_DATALESS(blk) (LPOS_DATALESS((blk)->begin) && \ LPOS_DATALESS((blk)->next)) /* Get the logical position at index 0 of the current wrap. */ #define DATA_THIS_WRAP_START_LPOS(data_ring, lpos) \ ((lpos) & ~DATA_SIZE_MASK(data_ring)) /* Get the ID for the same index of the previous wrap as the given ID. */ #define DESC_ID_PREV_WRAP(desc_ring, id) \ DESC_ID((id) - DESCS_COUNT(desc_ring)) /* * A data block: mapped directly to the beginning of the data block area * specified as a logical position within the data ring. * * @id: the ID of the associated descriptor * @data: the writer data * * Note that the size of a data block is only known by its associated * descriptor. */ struct prb_data_block { unsigned long id; char data[]; }; /* * Return the descriptor associated with @n. @n can be either a * descriptor ID or a sequence number. */ static struct prb_desc *to_desc(struct prb_desc_ring *desc_ring, u64 n) { return &desc_ring->descs[DESC_INDEX(desc_ring, n)]; } /* * Return the printk_info associated with @n. 
@n can be either a * descriptor ID or a sequence number. */ static struct printk_info *to_info(struct prb_desc_ring *desc_ring, u64 n) { return &desc_ring->infos[DESC_INDEX(desc_ring, n)]; } static struct prb_data_block *to_block(struct prb_data_ring *data_ring, unsigned long begin_lpos) { return (void *)&data_ring->data[DATA_INDEX(data_ring, begin_lpos)]; } /* * Increase the data size to account for data block meta data plus any * padding so that the adjacent data block is aligned on the ID size. */ static unsigned int to_blk_size(unsigned int size) { struct prb_data_block *db = NULL; size += sizeof(*db); size = ALIGN(size, sizeof(db->id)); return size; } /* * Sanity checker for reserve size. The ringbuffer code assumes that a data * block does not exceed the maximum possible size that could fit within the * ringbuffer. This function provides that basic size check so that the * assumption is safe. In particular, it guarantees that data_push_tail() will * never attempt to push the tail beyond the head. */ static bool data_check_size(struct prb_data_ring *data_ring, unsigned int size) { /* Data-less blocks take no space. */ if (size == 0) return true; /* * If data blocks were allowed to be larger than half the data ring * size, a wrapping data block could require more space than the full * ringbuffer. */ return to_blk_size(size) <= DATA_SIZE(data_ring) / 2; } /* Query the state of a descriptor. */ static enum desc_state get_desc_state(unsigned long id, unsigned long state_val) { if (id != DESC_ID(state_val)) return desc_miss; return DESC_STATE(state_val); } /* * Get a copy of a specified descriptor and return its queried state. If the * descriptor is in an inconsistent state (miss or reserved), the caller can * only expect the descriptor's @state_var field to be valid. * * The sequence number and caller_id can be optionally retrieved. Like all * non-state_var data, they are only valid if the descriptor is in a * consistent state. */ static enum desc_state desc_read(struct prb_desc_ring *desc_ring, unsigned long id, struct prb_desc *desc_out, u64 *seq_out, u32 *caller_id_out) { struct printk_info *info = to_info(desc_ring, id); struct prb_desc *desc = to_desc(desc_ring, id); atomic_long_t *state_var = &desc->state_var; enum desc_state d_state; unsigned long state_val; /* Check the descriptor state. */ state_val = atomic_long_read(state_var); /* LMM(desc_read:A) */ d_state = get_desc_state(id, state_val); if (d_state == desc_miss || d_state == desc_reserved) { /* * The descriptor is in an inconsistent state. Set at least * @state_var so that the caller can see the details of * the inconsistent state. */ goto out; } /* * Guarantee the state is loaded before copying the descriptor * content. This avoids copying obsolete descriptor content that might * not apply to the descriptor state. This pairs with _prb_commit:B. * * Memory barrier involvement: * * If desc_read:A reads from _prb_commit:B, then desc_read:C reads * from _prb_commit:A. * * Relies on: * * WMB from _prb_commit:A to _prb_commit:B * matching * RMB from desc_read:A to desc_read:C */ smp_rmb(); /* LMM(desc_read:B) */ /* * Copy the descriptor data. The data is not valid until the * state has been re-checked. A memcpy() for all of @desc * cannot be used because of the atomic_t @state_var field. 
*/ if (desc_out) { memcpy(&desc_out->text_blk_lpos, &desc->text_blk_lpos, sizeof(desc_out->text_blk_lpos)); /* LMM(desc_read:C) */ } if (seq_out) *seq_out = info->seq; /* also part of desc_read:C */ if (caller_id_out) *caller_id_out = info->caller_id; /* also part of desc_read:C */ /* * 1. Guarantee the descriptor content is loaded before re-checking * the state. This avoids reading an obsolete descriptor state * that may not apply to the copied content. This pairs with * desc_reserve:F. * * Memory barrier involvement: * * If desc_read:C reads from desc_reserve:G, then desc_read:E * reads from desc_reserve:F. * * Relies on: * * WMB from desc_reserve:F to desc_reserve:G * matching * RMB from desc_read:C to desc_read:E * * 2. Guarantee the record data is loaded before re-checking the * state. This avoids reading an obsolete descriptor state that may * not apply to the copied data. This pairs with data_alloc:A and * data_realloc:A. * * Memory barrier involvement: * * If copy_data:A reads from data_alloc:B, then desc_read:E * reads from desc_make_reusable:A. * * Relies on: * * MB from desc_make_reusable:A to data_alloc:B * matching * RMB from desc_read:C to desc_read:E * * Note: desc_make_reusable:A and data_alloc:B can be different * CPUs. However, the data_alloc:B CPU (which performs the * full memory barrier) must have previously seen * desc_make_reusable:A. */ smp_rmb(); /* LMM(desc_read:D) */ /* * The data has been copied. Return the current descriptor state, * which may have changed since the load above. */ state_val = atomic_long_read(state_var); /* LMM(desc_read:E) */ d_state = get_desc_state(id, state_val); out: if (desc_out) atomic_long_set(&desc_out->state_var, state_val); return d_state; } /* * Take a specified descriptor out of the finalized state by attempting * the transition from finalized to reusable. Either this context or some * other context will have been successful. */ static void desc_make_reusable(struct prb_desc_ring *desc_ring, unsigned long id) { unsigned long val_finalized = DESC_SV(id, desc_finalized); unsigned long val_reusable = DESC_SV(id, desc_reusable); struct prb_desc *desc = to_desc(desc_ring, id); atomic_long_t *state_var = &desc->state_var; atomic_long_cmpxchg_relaxed(state_var, val_finalized, val_reusable); /* LMM(desc_make_reusable:A) */ } /* * Given the text data ring, put the associated descriptor of each * data block from @lpos_begin until @lpos_end into the reusable state. * * If there is any problem making the associated descriptor reusable, either * the descriptor has not yet been finalized or another writer context has * already pushed the tail lpos past the problematic data block. Regardless, * on error the caller can re-load the tail lpos to determine the situation. */ static bool data_make_reusable(struct printk_ringbuffer *rb, unsigned long lpos_begin, unsigned long lpos_end, unsigned long *lpos_out) { struct prb_data_ring *data_ring = &rb->text_data_ring; struct prb_desc_ring *desc_ring = &rb->desc_ring; struct prb_data_block *blk; enum desc_state d_state; struct prb_desc desc; struct prb_data_blk_lpos *blk_lpos = &desc.text_blk_lpos; unsigned long id; /* Loop until @lpos_begin has advanced to or beyond @lpos_end. */ while ((lpos_end - lpos_begin) - 1 < DATA_SIZE(data_ring)) { blk = to_block(data_ring, lpos_begin); /* * Load the block ID from the data block. This is a data race * against a writer that may have newly reserved this data * area. 
If the loaded value matches a valid descriptor ID, * the blk_lpos of that descriptor will be checked to make * sure it points back to this data block. If the check fails, * the data area has been recycled by another writer. */ id = blk->id; /* LMM(data_make_reusable:A) */ d_state = desc_read(desc_ring, id, &desc, NULL, NULL); /* LMM(data_make_reusable:B) */ switch (d_state) { case desc_miss: case desc_reserved: case desc_committed: return false; case desc_finalized: /* * This data block is invalid if the descriptor * does not point back to it. */ if (blk_lpos->begin != lpos_begin) return false; desc_make_reusable(desc_ring, id); break; case desc_reusable: /* * This data block is invalid if the descriptor * does not point back to it. */ if (blk_lpos->begin != lpos_begin) return false; break; } /* Advance @lpos_begin to the next data block. */ lpos_begin = blk_lpos->next; } *lpos_out = lpos_begin; return true; } /* * Advance the data ring tail to at least @lpos. This function puts * descriptors into the reusable state if the tail is pushed beyond * their associated data block. */ static bool data_push_tail(struct printk_ringbuffer *rb, unsigned long lpos) { struct prb_data_ring *data_ring = &rb->text_data_ring; unsigned long tail_lpos_new; unsigned long tail_lpos; unsigned long next_lpos; /* If @lpos is from a data-less block, there is nothing to do. */ if (LPOS_DATALESS(lpos)) return true; /* * Any descriptor states that have transitioned to reusable due to the * data tail being pushed to this loaded value will be visible to this * CPU. This pairs with data_push_tail:D. * * Memory barrier involvement: * * If data_push_tail:A reads from data_push_tail:D, then this CPU can * see desc_make_reusable:A. * * Relies on: * * MB from desc_make_reusable:A to data_push_tail:D * matches * READFROM from data_push_tail:D to data_push_tail:A * thus * READFROM from desc_make_reusable:A to this CPU */ tail_lpos = atomic_long_read(&data_ring->tail_lpos); /* LMM(data_push_tail:A) */ /* * Loop until the tail lpos is at or beyond @lpos. This condition * may already be satisfied, resulting in no full memory barrier * from data_push_tail:D being performed. However, since this CPU * sees the new tail lpos, any descriptor states that transitioned to * the reusable state must already be visible. */ while ((lpos - tail_lpos) - 1 < DATA_SIZE(data_ring)) { /* * Make all descriptors reusable that are associated with * data blocks before @lpos. */ if (!data_make_reusable(rb, tail_lpos, lpos, &next_lpos)) { /* * 1. Guarantee the block ID loaded in * data_make_reusable() is performed before * reloading the tail lpos. The failed * data_make_reusable() may be due to a newly * recycled data area causing the tail lpos to * have been previously pushed. This pairs with * data_alloc:A and data_realloc:A. * * Memory barrier involvement: * * If data_make_reusable:A reads from data_alloc:B, * then data_push_tail:C reads from * data_push_tail:D. * * Relies on: * * MB from data_push_tail:D to data_alloc:B * matching * RMB from data_make_reusable:A to * data_push_tail:C * * Note: data_push_tail:D and data_alloc:B can be * different CPUs. However, the data_alloc:B * CPU (which performs the full memory * barrier) must have previously seen * data_push_tail:D. * * 2. Guarantee the descriptor state loaded in * data_make_reusable() is performed before * reloading the tail lpos. The failed * data_make_reusable() may be due to a newly * recycled descriptor causing the tail lpos to * have been previously pushed. 
This pairs with * desc_reserve:D. * * Memory barrier involvement: * * If data_make_reusable:B reads from * desc_reserve:F, then data_push_tail:C reads * from data_push_tail:D. * * Relies on: * * MB from data_push_tail:D to desc_reserve:F * matching * RMB from data_make_reusable:B to * data_push_tail:C * * Note: data_push_tail:D and desc_reserve:F can * be different CPUs. However, the * desc_reserve:F CPU (which performs the * full memory barrier) must have previously * seen data_push_tail:D. */ smp_rmb(); /* LMM(data_push_tail:B) */ tail_lpos_new = atomic_long_read(&data_ring->tail_lpos ); /* LMM(data_push_tail:C) */ if (tail_lpos_new == tail_lpos) return false; /* Another CPU pushed the tail. Try again. */ tail_lpos = tail_lpos_new; continue; } /* * Guarantee any descriptor states that have transitioned to * reusable are stored before pushing the tail lpos. A full * memory barrier is needed since other CPUs may have made * the descriptor states reusable. This pairs with * data_push_tail:A. */ if (atomic_long_try_cmpxchg(&data_ring->tail_lpos, &tail_lpos, next_lpos)) { /* LMM(data_push_tail:D) */ break; } } return true; } /* * Advance the desc ring tail. This function advances the tail by one * descriptor, thus invalidating the oldest descriptor. Before advancing * the tail, the tail descriptor is made reusable and all data blocks up to * and including the descriptor's data block are invalidated (i.e. the data * ring tail is pushed past the data block of the descriptor being made * reusable). */ static bool desc_push_tail(struct printk_ringbuffer *rb, unsigned long tail_id) { struct prb_desc_ring *desc_ring = &rb->desc_ring; enum desc_state d_state; struct prb_desc desc; d_state = desc_read(desc_ring, tail_id, &desc, NULL, NULL); switch (d_state) { case desc_miss: /* * If the ID is exactly 1 wrap behind the expected, it is * in the process of being reserved by another writer and * must be considered reserved. */ if (DESC_ID(atomic_long_read(&desc.state_var)) == DESC_ID_PREV_WRAP(desc_ring, tail_id)) { return false; } /* * The ID has changed. Another writer must have pushed the * tail and recycled the descriptor already. Success is * returned because the caller is only interested in the * specified tail being pushed, which it was. */ return true; case desc_reserved: case desc_committed: return false; case desc_finalized: desc_make_reusable(desc_ring, tail_id); break; case desc_reusable: break; } /* * Data blocks must be invalidated before their associated * descriptor can be made available for recycling. Invalidating * them later is not possible because there is no way to trust * data blocks once their associated descriptor is gone. */ if (!data_push_tail(rb, desc.text_blk_lpos.next)) return false; /* * Check the next descriptor after @tail_id before pushing the tail * to it because the tail must always be in a finalized or reusable * state. The implementation of prb_first_seq() relies on this. * * A successful read implies that the next descriptor is less than or * equal to @head_id so there is no risk of pushing the tail past the * head. */ d_state = desc_read(desc_ring, DESC_ID(tail_id + 1), &desc, NULL, NULL); /* LMM(desc_push_tail:A) */ if (d_state == desc_finalized || d_state == desc_reusable) { /* * Guarantee any descriptor states that have transitioned to * reusable are stored before pushing the tail ID. This allows * verifying the recycled descriptor state. A full memory * barrier is needed since other CPUs may have made the * descriptor states reusable. 
This pairs with desc_reserve:D. */ atomic_long_cmpxchg(&desc_ring->tail_id, tail_id, DESC_ID(tail_id + 1)); /* LMM(desc_push_tail:B) */ } else { /* * Guarantee the last state load from desc_read() is before * reloading @tail_id in order to see a new tail ID in the * case that the descriptor has been recycled. This pairs * with desc_reserve:D. * * Memory barrier involvement: * * If desc_push_tail:A reads from desc_reserve:F, then * desc_push_tail:D reads from desc_push_tail:B. * * Relies on: * * MB from desc_push_tail:B to desc_reserve:F * matching * RMB from desc_push_tail:A to desc_push_tail:D * * Note: desc_push_tail:B and desc_reserve:F can be different * CPUs. However, the desc_reserve:F CPU (which performs * the full memory barrier) must have previously seen * desc_push_tail:B. */ smp_rmb(); /* LMM(desc_push_tail:C) */ /* * Re-check the tail ID. The descriptor following @tail_id is * not in an allowed tail state. But if the tail has since * been moved by another CPU, then it does not matter. */ if (atomic_long_read(&desc_ring->tail_id) == tail_id) /* LMM(desc_push_tail:D) */ return false; } return true; } /* Reserve a new descriptor, invalidating the oldest if necessary. */ static bool desc_reserve(struct printk_ringbuffer *rb, unsigned long *id_out) { struct prb_desc_ring *desc_ring = &rb->desc_ring; unsigned long prev_state_val; unsigned long id_prev_wrap; struct prb_desc *desc; unsigned long head_id; unsigned long id; head_id = atomic_long_read(&desc_ring->head_id); /* LMM(desc_reserve:A) */ do { id = DESC_ID(head_id + 1); id_prev_wrap = DESC_ID_PREV_WRAP(desc_ring, id); /* * Guarantee the head ID is read before reading the tail ID. * Since the tail ID is updated before the head ID, this * guarantees that @id_prev_wrap is never ahead of the tail * ID. This pairs with desc_reserve:D. * * Memory barrier involvement: * * If desc_reserve:A reads from desc_reserve:D, then * desc_reserve:C reads from desc_push_tail:B. * * Relies on: * * MB from desc_push_tail:B to desc_reserve:D * matching * RMB from desc_reserve:A to desc_reserve:C * * Note: desc_push_tail:B and desc_reserve:D can be different * CPUs. However, the desc_reserve:D CPU (which performs * the full memory barrier) must have previously seen * desc_push_tail:B. */ smp_rmb(); /* LMM(desc_reserve:B) */ if (id_prev_wrap == atomic_long_read(&desc_ring->tail_id )) { /* LMM(desc_reserve:C) */ /* * Make space for the new descriptor by * advancing the tail. */ if (!desc_push_tail(rb, id_prev_wrap)) return false; } /* * 1. Guarantee the tail ID is read before validating the * recycled descriptor state. A read memory barrier is * sufficient for this. This pairs with desc_push_tail:B. * * Memory barrier involvement: * * If desc_reserve:C reads from desc_push_tail:B, then * desc_reserve:E reads from desc_make_reusable:A. * * Relies on: * * MB from desc_make_reusable:A to desc_push_tail:B * matching * RMB from desc_reserve:C to desc_reserve:E * * Note: desc_make_reusable:A and desc_push_tail:B can be * different CPUs. However, the desc_push_tail:B CPU * (which performs the full memory barrier) must have * previously seen desc_make_reusable:A. * * 2. Guarantee the tail ID is stored before storing the head * ID. This pairs with desc_reserve:B. * * 3. Guarantee any data ring tail changes are stored before * recycling the descriptor. Data ring tail changes can * happen via desc_push_tail()->data_push_tail(). A full * memory barrier is needed since another CPU may have * pushed the data ring tails. This pairs with * data_push_tail:B. * * 4. 
Guarantee a new tail ID is stored before recycling the * descriptor. A full memory barrier is needed since * another CPU may have pushed the tail ID. This pairs * with desc_push_tail:C and this also pairs with * prb_first_seq:C. * * 5. Guarantee the head ID is stored before trying to * finalize the previous descriptor. This pairs with * _prb_commit:B. */ } while (!atomic_long_try_cmpxchg(&desc_ring->head_id, &head_id, id)); /* LMM(desc_reserve:D) */ desc = to_desc(desc_ring, id); /* * If the descriptor has been recycled, verify the old state val. * See "ABA Issues" about why this verification is performed. */ prev_state_val = atomic_long_read(&desc->state_var); /* LMM(desc_reserve:E) */ if (prev_state_val && get_desc_state(id_prev_wrap, prev_state_val) != desc_reusable) { WARN_ON_ONCE(1); return false; } /* * Assign the descriptor a new ID and set its state to reserved. * See "ABA Issues" about why cmpxchg() instead of set() is used. * * Guarantee the new descriptor ID and state is stored before making * any other changes. A write memory barrier is sufficient for this. * This pairs with desc_read:D. */ if (!atomic_long_try_cmpxchg(&desc->state_var, &prev_state_val, DESC_SV(id, desc_reserved))) { /* LMM(desc_reserve:F) */ WARN_ON_ONCE(1); return false; } /* Now data in @desc can be modified: LMM(desc_reserve:G) */ *id_out = id; return true; } /* Determine the end of a data block. */ static unsigned long get_next_lpos(struct prb_data_ring *data_ring, unsigned long lpos, unsigned int size) { unsigned long begin_lpos; unsigned long next_lpos; begin_lpos = lpos; next_lpos = lpos + size; /* First check if the data block does not wrap. */ if (DATA_WRAPS(data_ring, begin_lpos) == DATA_WRAPS(data_ring, next_lpos)) return next_lpos; /* Wrapping data blocks store their data at the beginning. */ return (DATA_THIS_WRAP_START_LPOS(data_ring, next_lpos) + size); } /* * Allocate a new data block, invalidating the oldest data block(s) * if necessary. This function also associates the data block with * a specified descriptor. */ static char *data_alloc(struct printk_ringbuffer *rb, unsigned int size, struct prb_data_blk_lpos *blk_lpos, unsigned long id) { struct prb_data_ring *data_ring = &rb->text_data_ring; struct prb_data_block *blk; unsigned long begin_lpos; unsigned long next_lpos; if (size == 0) { /* * Data blocks are not created for empty lines. Instead, the * reader will recognize these special lpos values and handle * it appropriately. */ blk_lpos->begin = EMPTY_LINE_LPOS; blk_lpos->next = EMPTY_LINE_LPOS; return NULL; } size = to_blk_size(size); begin_lpos = atomic_long_read(&data_ring->head_lpos); do { next_lpos = get_next_lpos(data_ring, begin_lpos, size); /* * data_check_size() prevents data block allocation that could * cause illegal ringbuffer states. But double check that the * used space will not be bigger than the ring buffer. Wrapped * messages need to reserve more space, see get_next_lpos(). * * Specify a data-less block when the check or the allocation * fails. */ if (WARN_ON_ONCE(next_lpos - begin_lpos > DATA_SIZE(data_ring)) || !data_push_tail(rb, next_lpos - DATA_SIZE(data_ring))) { blk_lpos->begin = FAILED_LPOS; blk_lpos->next = FAILED_LPOS; return NULL; } /* * 1. Guarantee any descriptor states that have transitioned * to reusable are stored before modifying the newly * allocated data area. A full memory barrier is needed * since other CPUs may have made the descriptor states * reusable. See data_push_tail:A about why the reusable * states are visible. This pairs with desc_read:D. 
* * 2. Guarantee any updated tail lpos is stored before * modifying the newly allocated data area. Another CPU may * be in data_make_reusable() and is reading a block ID * from this area. data_make_reusable() can handle reading * a garbage block ID value, but then it must be able to * load a new tail lpos. A full memory barrier is needed * since other CPUs may have updated the tail lpos. This * pairs with data_push_tail:B. */ } while (!atomic_long_try_cmpxchg(&data_ring->head_lpos, &begin_lpos, next_lpos)); /* LMM(data_alloc:A) */ blk = to_block(data_ring, begin_lpos); blk->id = id; /* LMM(data_alloc:B) */ if (DATA_WRAPS(data_ring, begin_lpos) != DATA_WRAPS(data_ring, next_lpos)) { /* Wrapping data blocks store their data at the beginning. */ blk = to_block(data_ring, 0); /* * Store the ID on the wrapped block for consistency. * The printk_ringbuffer does not actually use it. */ blk->id = id; } blk_lpos->begin = begin_lpos; blk_lpos->next = next_lpos; return &blk->data[0]; } /* * Try to resize an existing data block associated with the descriptor * specified by @id. If the resized data block should become wrapped, it * copies the old data to the new data block. If @size yields a data block * with the same or less size, the data block is left as is. * * Fail if this is not the last allocated data block or if there is not * enough space or it is not possible make enough space. * * Return a pointer to the beginning of the entire data buffer or NULL on * failure. */ static char *data_realloc(struct printk_ringbuffer *rb, unsigned int size, struct prb_data_blk_lpos *blk_lpos, unsigned long id) { struct prb_data_ring *data_ring = &rb->text_data_ring; struct prb_data_block *blk; unsigned long head_lpos; unsigned long next_lpos; bool wrapped; /* Reallocation only works if @blk_lpos is the newest data block. */ head_lpos = atomic_long_read(&data_ring->head_lpos); if (head_lpos != blk_lpos->next) return NULL; /* Keep track if @blk_lpos was a wrapping data block. */ wrapped = (DATA_WRAPS(data_ring, blk_lpos->begin) != DATA_WRAPS(data_ring, blk_lpos->next)); size = to_blk_size(size); next_lpos = get_next_lpos(data_ring, blk_lpos->begin, size); /* If the data block does not increase, there is nothing to do. */ if (head_lpos - next_lpos < DATA_SIZE(data_ring)) { if (wrapped) blk = to_block(data_ring, 0); else blk = to_block(data_ring, blk_lpos->begin); return &blk->data[0]; } /* * data_check_size() prevents data block reallocation that could * cause illegal ringbuffer states. But double check that the * new used space will not be bigger than the ring buffer. Wrapped * messages need to reserve more space, see get_next_lpos(). * * Specify failure when the check or the allocation fails. */ if (WARN_ON_ONCE(next_lpos - blk_lpos->begin > DATA_SIZE(data_ring)) || !data_push_tail(rb, next_lpos - DATA_SIZE(data_ring))) { return NULL; } /* The memory barrier involvement is the same as data_alloc:A. */ if (!atomic_long_try_cmpxchg(&data_ring->head_lpos, &head_lpos, next_lpos)) { /* LMM(data_realloc:A) */ return NULL; } blk = to_block(data_ring, blk_lpos->begin); if (DATA_WRAPS(data_ring, blk_lpos->begin) != DATA_WRAPS(data_ring, next_lpos)) { struct prb_data_block *old_blk = blk; /* Wrapping data blocks store their data at the beginning. */ blk = to_block(data_ring, 0); /* * Store the ID on the wrapped block for consistency. * The printk_ringbuffer does not actually use it. 
*/ blk->id = id; if (!wrapped) { /* * Since the allocated space is now in the newly * created wrapping data block, copy the content * from the old data block. */ memcpy(&blk->data[0], &old_blk->data[0], (blk_lpos->next - blk_lpos->begin) - sizeof(blk->id)); } } blk_lpos->next = next_lpos; return &blk->data[0]; } /* Return the number of bytes used by a data block. */ static unsigned int space_used(struct prb_data_ring *data_ring, struct prb_data_blk_lpos *blk_lpos) { /* Data-less blocks take no space. */ if (BLK_DATALESS(blk_lpos)) return 0; if (DATA_WRAPS(data_ring, blk_lpos->begin) == DATA_WRAPS(data_ring, blk_lpos->next)) { /* Data block does not wrap. */ return (DATA_INDEX(data_ring, blk_lpos->next) - DATA_INDEX(data_ring, blk_lpos->begin)); } /* * For wrapping data blocks, the trailing (wasted) space is * also counted. */ return (DATA_INDEX(data_ring, blk_lpos->next) + DATA_SIZE(data_ring) - DATA_INDEX(data_ring, blk_lpos->begin)); } /* * Given @blk_lpos, return a pointer to the writer data from the data block * and calculate the size of the data part. A NULL pointer is returned if * @blk_lpos specifies values that could never be legal. * * This function (used by readers) performs strict validation on the lpos * values to possibly detect bugs in the writer code. A WARN_ON_ONCE() is * triggered if an internal error is detected. */ static const char *get_data(struct prb_data_ring *data_ring, struct prb_data_blk_lpos *blk_lpos, unsigned int *data_size) { struct prb_data_block *db; /* Data-less data block description. */ if (BLK_DATALESS(blk_lpos)) { /* * Records that are just empty lines are also valid, even * though they do not have a data block. For such records * explicitly return empty string data to signify success. */ if (blk_lpos->begin == EMPTY_LINE_LPOS && blk_lpos->next == EMPTY_LINE_LPOS) { *data_size = 0; return ""; } /* Data lost, invalid, or otherwise unavailable. */ return NULL; } /* Regular data block: @begin less than @next and in same wrap. */ if (DATA_WRAPS(data_ring, blk_lpos->begin) == DATA_WRAPS(data_ring, blk_lpos->next) && blk_lpos->begin < blk_lpos->next) { db = to_block(data_ring, blk_lpos->begin); *data_size = blk_lpos->next - blk_lpos->begin; /* Wrapping data block: @begin is one wrap behind @next. */ } else if (DATA_WRAPS(data_ring, blk_lpos->begin + DATA_SIZE(data_ring)) == DATA_WRAPS(data_ring, blk_lpos->next)) { db = to_block(data_ring, 0); *data_size = DATA_INDEX(data_ring, blk_lpos->next); /* Illegal block description. */ } else { WARN_ON_ONCE(1); return NULL; } /* A valid data block will always be aligned to the ID size. */ if (WARN_ON_ONCE(blk_lpos->begin != ALIGN(blk_lpos->begin, sizeof(db->id))) || WARN_ON_ONCE(blk_lpos->next != ALIGN(blk_lpos->next, sizeof(db->id)))) { return NULL; } /* A valid data block will always have at least an ID. */ if (WARN_ON_ONCE(*data_size < sizeof(db->id))) return NULL; /* Subtract block ID space from size to reflect data size. */ *data_size -= sizeof(db->id); return &db->data[0]; } /* * Attempt to transition the newest descriptor from committed back to reserved * so that the record can be modified by a writer again. This is only possible * if the descriptor is not yet finalized and the provided @caller_id matches. 
*/ static struct prb_desc *desc_reopen_last(struct prb_desc_ring *desc_ring, u32 caller_id, unsigned long *id_out) { unsigned long prev_state_val; enum desc_state d_state; struct prb_desc desc; struct prb_desc *d; unsigned long id; u32 cid; id = atomic_long_read(&desc_ring->head_id); /* * To reduce unnecessarily reopening, first check if the descriptor * state and caller ID are correct. */ d_state = desc_read(desc_ring, id, &desc, NULL, &cid); if (d_state != desc_committed || cid != caller_id) return NULL; d = to_desc(desc_ring, id); prev_state_val = DESC_SV(id, desc_committed); /* * Guarantee the reserved state is stored before reading any * record data. A full memory barrier is needed because @state_var * modification is followed by reading. This pairs with _prb_commit:B. * * Memory barrier involvement: * * If desc_reopen_last:A reads from _prb_commit:B, then * prb_reserve_in_last:A reads from _prb_commit:A. * * Relies on: * * WMB from _prb_commit:A to _prb_commit:B * matching * MB If desc_reopen_last:A to prb_reserve_in_last:A */ if (!atomic_long_try_cmpxchg(&d->state_var, &prev_state_val, DESC_SV(id, desc_reserved))) { /* LMM(desc_reopen_last:A) */ return NULL; } *id_out = id; return d; } /** * prb_reserve_in_last() - Re-reserve and extend the space in the ringbuffer * used by the newest record. * * @e: The entry structure to setup. * @rb: The ringbuffer to re-reserve and extend data in. * @r: The record structure to allocate buffers for. * @caller_id: The caller ID of the caller (reserving writer). * @max_size: Fail if the extended size would be greater than this. * * This is the public function available to writers to re-reserve and extend * data. * * The writer specifies the text size to extend (not the new total size) by * setting the @text_buf_size field of @r. To ensure proper initialization * of @r, prb_rec_init_wr() should be used. * * This function will fail if @caller_id does not match the caller ID of the * newest record. In that case the caller must reserve new data using * prb_reserve(). * * Context: Any context. Disables local interrupts on success. * Return: true if text data could be extended, otherwise false. * * On success: * * - @r->text_buf points to the beginning of the entire text buffer. * * - @r->text_buf_size is set to the new total size of the buffer. * * - @r->info is not touched so that @r->info->text_len could be used * to append the text. * * - prb_record_text_space() can be used on @e to query the new * actually used space. * * Important: All @r->info fields will already be set with the current values * for the record. I.e. @r->info->text_len will be less than * @text_buf_size. Writers can use @r->info->text_len to know * where concatenation begins and writers should update * @r->info->text_len after concatenating. */ bool prb_reserve_in_last(struct prb_reserved_entry *e, struct printk_ringbuffer *rb, struct printk_record *r, u32 caller_id, unsigned int max_size) { struct prb_desc_ring *desc_ring = &rb->desc_ring; struct printk_info *info; unsigned int data_size; struct prb_desc *d; unsigned long id; local_irq_save(e->irqflags); /* Transition the newest descriptor back to the reserved state. */ d = desc_reopen_last(desc_ring, caller_id, &id); if (!d) { local_irq_restore(e->irqflags); goto fail_reopen; } /* Now the writer has exclusive access: LMM(prb_reserve_in_last:A) */ info = to_info(desc_ring, id); /* * Set the @e fields here so that prb_commit() can be used if * anything fails from now on. 
*/ e->rb = rb; e->id = id; /* * desc_reopen_last() checked the caller_id, but there was no * exclusive access at that point. The descriptor may have * changed since then. */ if (caller_id != info->caller_id) goto fail; if (BLK_DATALESS(&d->text_blk_lpos)) { if (WARN_ON_ONCE(info->text_len != 0)) { pr_warn_once("wrong text_len value (%hu, expecting 0)\n", info->text_len); info->text_len = 0; } if (!data_check_size(&rb->text_data_ring, r->text_buf_size)) goto fail; if (r->text_buf_size > max_size) goto fail; r->text_buf = data_alloc(rb, r->text_buf_size, &d->text_blk_lpos, id); } else { if (!get_data(&rb->text_data_ring, &d->text_blk_lpos, &data_size)) goto fail; /* * Increase the buffer size to include the original size. If * the meta data (@text_len) is not sane, use the full data * block size. */ if (WARN_ON_ONCE(info->text_len > data_size)) { pr_warn_once("wrong text_len value (%hu, expecting <=%u)\n", info->text_len, data_size); info->text_len = data_size; } r->text_buf_size += info->text_len; if (!data_check_size(&rb->text_data_ring, r->text_buf_size)) goto fail; if (r->text_buf_size > max_size) goto fail; r->text_buf = data_realloc(rb, r->text_buf_size, &d->text_blk_lpos, id); } if (r->text_buf_size && !r->text_buf) goto fail; r->info = info; e->text_space = space_used(&rb->text_data_ring, &d->text_blk_lpos); return true; fail: prb_commit(e); /* prb_commit() re-enabled interrupts. */ fail_reopen: /* Make it clear to the caller that the re-reserve failed. */ memset(r, 0, sizeof(*r)); return false; } /* * @last_finalized_seq value guarantees that all records up to and including * this sequence number are finalized and can be read. The only exception are * too old records which have already been overwritten. * * It is also guaranteed that @last_finalized_seq only increases. * * Be aware that finalized records following non-finalized records are not * reported because they are not yet available to the reader. For example, * a new record stored via printk() will not be available to a printer if * it follows a record that has not been finalized yet. However, once that * non-finalized record becomes finalized, @last_finalized_seq will be * appropriately updated and the full set of finalized records will be * available to the printer. And since each printk() caller will either * directly print or trigger deferred printing of all available unprinted * records, all printk() messages will get printed. */ static u64 desc_last_finalized_seq(struct printk_ringbuffer *rb) { struct prb_desc_ring *desc_ring = &rb->desc_ring; unsigned long ulseq; /* * Guarantee the sequence number is loaded before loading the * associated record in order to guarantee that the record can be * seen by this CPU. This pairs with desc_update_last_finalized:A. */ ulseq = atomic_long_read_acquire(&desc_ring->last_finalized_seq ); /* LMM(desc_last_finalized_seq:A) */ return __ulseq_to_u64seq(rb, ulseq); } static bool _prb_read_valid(struct printk_ringbuffer *rb, u64 *seq, struct printk_record *r, unsigned int *line_count); /* * Check if there are records directly following @last_finalized_seq that are * finalized. If so, update @last_finalized_seq to the latest of these * records. It is not allowed to skip over records that are not yet finalized. 
*/ static void desc_update_last_finalized(struct printk_ringbuffer *rb) { struct prb_desc_ring *desc_ring = &rb->desc_ring; u64 old_seq = desc_last_finalized_seq(rb); unsigned long oldval; unsigned long newval; u64 finalized_seq; u64 try_seq; try_again: finalized_seq = old_seq; try_seq = finalized_seq + 1; /* Try to find later finalized records. */ while (_prb_read_valid(rb, &try_seq, NULL, NULL)) { finalized_seq = try_seq; try_seq++; } /* No update needed if no later finalized record was found. */ if (finalized_seq == old_seq) return; oldval = __u64seq_to_ulseq(old_seq); newval = __u64seq_to_ulseq(finalized_seq); /* * Set the sequence number of a later finalized record that has been * seen. * * Guarantee the record data is visible to other CPUs before storing * its sequence number. This pairs with desc_last_finalized_seq:A. * * Memory barrier involvement: * * If desc_last_finalized_seq:A reads from * desc_update_last_finalized:A, then desc_read:A reads from * _prb_commit:B. * * Relies on: * * RELEASE from _prb_commit:B to desc_update_last_finalized:A * matching * ACQUIRE from desc_last_finalized_seq:A to desc_read:A * * Note: _prb_commit:B and desc_update_last_finalized:A can be * different CPUs. However, the desc_update_last_finalized:A * CPU (which performs the release) must have previously seen * _prb_commit:B. */ if (!atomic_long_try_cmpxchg_release(&desc_ring->last_finalized_seq, &oldval, newval)) { /* LMM(desc_update_last_finalized:A) */ old_seq = __ulseq_to_u64seq(rb, oldval); goto try_again; } } /* * Attempt to finalize a specified descriptor. If this fails, the descriptor * is either already final or it will finalize itself when the writer commits. */ static void desc_make_final(struct printk_ringbuffer *rb, unsigned long id) { struct prb_desc_ring *desc_ring = &rb->desc_ring; unsigned long prev_state_val = DESC_SV(id, desc_committed); struct prb_desc *d = to_desc(desc_ring, id); if (atomic_long_try_cmpxchg_relaxed(&d->state_var, &prev_state_val, DESC_SV(id, desc_finalized))) { /* LMM(desc_make_final:A) */ desc_update_last_finalized(rb); } } /** * prb_reserve() - Reserve space in the ringbuffer. * * @e: The entry structure to setup. * @rb: The ringbuffer to reserve data in. * @r: The record structure to allocate buffers for. * * This is the public function available to writers to reserve data. * * The writer specifies the text size to reserve by setting the * @text_buf_size field of @r. To ensure proper initialization of @r, * prb_rec_init_wr() should be used. * * Context: Any context. Disables local interrupts on success. * Return: true if at least text data could be allocated, otherwise false. * * On success, the fields @info and @text_buf of @r will be set by this * function and should be filled in by the writer before committing. Also * on success, prb_record_text_space() can be used on @e to query the actual * space used for the text data block. * * Important: @info->text_len needs to be set correctly by the writer in * order for data to be readable and/or extended. Its value * is initialized to 0. */ bool prb_reserve(struct prb_reserved_entry *e, struct printk_ringbuffer *rb, struct printk_record *r) { struct prb_desc_ring *desc_ring = &rb->desc_ring; struct printk_info *info; struct prb_desc *d; unsigned long id; u64 seq; if (!data_check_size(&rb->text_data_ring, r->text_buf_size)) goto fail; /* * Descriptors in the reserved state act as blockers to all further * reservations once the desc_ring has fully wrapped. 
Disable * interrupts during the reserve/commit window in order to minimize * the likelihood of this happening. */ local_irq_save(e->irqflags); if (!desc_reserve(rb, &id)) { /* Descriptor reservation failures are tracked. */ atomic_long_inc(&rb->fail); local_irq_restore(e->irqflags); goto fail; } d = to_desc(desc_ring, id); info = to_info(desc_ring, id); /* * All @info fields (except @seq) are cleared and must be filled in * by the writer. Save @seq before clearing because it is used to * determine the new sequence number. */ seq = info->seq; memset(info, 0, sizeof(*info)); /* * Set the @e fields here so that prb_commit() can be used if * text data allocation fails. */ e->rb = rb; e->id = id; /* * Initialize the sequence number if it has "never been set". * Otherwise just increment it by a full wrap. * * @seq is considered "never been set" if it has a value of 0, * _except_ for @infos[0], which was specially setup by the ringbuffer * initializer and therefore is always considered as set. * * See the "Bootstrap" comment block in printk_ringbuffer.h for * details about how the initializer bootstraps the descriptors. */ if (seq == 0 && DESC_INDEX(desc_ring, id) != 0) info->seq = DESC_INDEX(desc_ring, id); else info->seq = seq + DESCS_COUNT(desc_ring); /* * New data is about to be reserved. Once that happens, previous * descriptors are no longer able to be extended. Finalize the * previous descriptor now so that it can be made available to * readers. (For seq==0 there is no previous descriptor.) */ if (info->seq > 0) desc_make_final(rb, DESC_ID(id - 1)); r->text_buf = data_alloc(rb, r->text_buf_size, &d->text_blk_lpos, id); /* If text data allocation fails, a data-less record is committed. */ if (r->text_buf_size && !r->text_buf) { prb_commit(e); /* prb_commit() re-enabled interrupts. */ goto fail; } r->info = info; /* Record full text space used by record. */ e->text_space = space_used(&rb->text_data_ring, &d->text_blk_lpos); return true; fail: /* Make it clear to the caller that the reserve failed. */ memset(r, 0, sizeof(*r)); return false; } EXPORT_SYMBOL_IF_KUNIT(prb_reserve); /* Commit the data (possibly finalizing it) and restore interrupts. */ static void _prb_commit(struct prb_reserved_entry *e, unsigned long state_val) { struct prb_desc_ring *desc_ring = &e->rb->desc_ring; struct prb_desc *d = to_desc(desc_ring, e->id); unsigned long prev_state_val = DESC_SV(e->id, desc_reserved); /* Now the writer has finished all writing: LMM(_prb_commit:A) */ /* * Set the descriptor as committed. See "ABA Issues" about why * cmpxchg() instead of set() is used. * * 1 Guarantee all record data is stored before the descriptor state * is stored as committed. A write memory barrier is sufficient * for this. This pairs with desc_read:B and desc_reopen_last:A. * * 2. Guarantee the descriptor state is stored as committed before * re-checking the head ID in order to possibly finalize this * descriptor. This pairs with desc_reserve:D. * * Memory barrier involvement: * * If prb_commit:A reads from desc_reserve:D, then * desc_make_final:A reads from _prb_commit:B. * * Relies on: * * MB _prb_commit:B to prb_commit:A * matching * MB desc_reserve:D to desc_make_final:A */ if (!atomic_long_try_cmpxchg(&d->state_var, &prev_state_val, DESC_SV(e->id, state_val))) { /* LMM(_prb_commit:B) */ WARN_ON_ONCE(1); } /* Restore interrupts, the reserve/commit window is finished. */ local_irq_restore(e->irqflags); } /** * prb_commit() - Commit (previously reserved) data to the ringbuffer. 
* * @e: The entry containing the reserved data information. * * This is the public function available to writers to commit data. * * Note that the data is not yet available to readers until it is finalized. * Finalizing happens automatically when space for the next record is * reserved. * * See prb_final_commit() for a version of this function that finalizes * immediately. * * Context: Any context. Enables local interrupts. */ void prb_commit(struct prb_reserved_entry *e) { struct prb_desc_ring *desc_ring = &e->rb->desc_ring; unsigned long head_id; _prb_commit(e, desc_committed); /* * If this descriptor is no longer the head (i.e. a new record has * been allocated), extending the data for this record is no longer * allowed and therefore it must be finalized. */ head_id = atomic_long_read(&desc_ring->head_id); /* LMM(prb_commit:A) */ if (head_id != e->id) desc_make_final(e->rb, e->id); } EXPORT_SYMBOL_IF_KUNIT(prb_commit); /** * prb_final_commit() - Commit and finalize (previously reserved) data to * the ringbuffer. * * @e: The entry containing the reserved data information. * * This is the public function available to writers to commit+finalize data. * * By finalizing, the data is made immediately available to readers. * * This function should only be used if there are no intentions of extending * this data using prb_reserve_in_last(). * * Context: Any context. Enables local interrupts. */ void prb_final_commit(struct prb_reserved_entry *e) { _prb_commit(e, desc_finalized); desc_update_last_finalized(e->rb); } /* * Count the number of lines in provided text. All text has at least 1 line * (even if @text_size is 0). Each '\n' processed is counted as an additional * line. */ static unsigned int count_lines(const char *text, unsigned int text_size) { unsigned int next_size = text_size; unsigned int line_count = 1; const char *next = text; while (next_size) { next = memchr(next, '\n', next_size); if (!next) break; line_count++; next++; next_size = text_size - (next - text); } return line_count; } /* * Given @blk_lpos, copy an expected @len of data into the provided buffer. * If @line_count is provided, count the number of lines in the data. * * This function (used by readers) performs strict validation on the data * size to possibly detect bugs in the writer code. A WARN_ON_ONCE() is * triggered if an internal error is detected. */ static bool copy_data(struct prb_data_ring *data_ring, struct prb_data_blk_lpos *blk_lpos, u16 len, char *buf, unsigned int buf_size, unsigned int *line_count) { unsigned int data_size; const char *data; /* Caller might not want any data. */ if ((!buf || !buf_size) && !line_count) return true; data = get_data(data_ring, blk_lpos, &data_size); if (!data) return false; /* * Actual cannot be less than expected. It can be more than expected * because of the trailing alignment padding. * * Note that invalid @len values can occur because the caller loads * the value during an allowed data race. */ if (data_size < (unsigned int)len) return false; /* Caller interested in the line count? */ if (line_count) *line_count = count_lines(data, len); /* Caller interested in the data content? */ if (!buf || !buf_size) return true; data_size = min_t(unsigned int, buf_size, len); memcpy(&buf[0], data, data_size); /* LMM(copy_data:A) */ return true; } /* * This is an extended version of desc_read(). It gets a copy of a specified * descriptor. However, it also verifies that the record is finalized and has * the sequence number @seq. On success, 0 is returned. 
* * Error return values: * -EINVAL: A finalized record with sequence number @seq does not exist. * -ENOENT: A finalized record with sequence number @seq exists, but its data * is not available. This is a valid record, so readers should * continue with the next record. */ static int desc_read_finalized_seq(struct prb_desc_ring *desc_ring, unsigned long id, u64 seq, struct prb_desc *desc_out) { struct prb_data_blk_lpos *blk_lpos = &desc_out->text_blk_lpos; enum desc_state d_state; u64 s; d_state = desc_read(desc_ring, id, desc_out, &s, NULL); /* * An unexpected @id (desc_miss) or @seq mismatch means the record * does not exist. A descriptor in the reserved or committed state * means the record does not yet exist for the reader. */ if (d_state == desc_miss || d_state == desc_reserved || d_state == desc_committed || s != seq) { return -EINVAL; } /* * A descriptor in the reusable state may no longer have its data * available; report it as existing but with lost data. Or the record * may actually be a record with lost data. */ if (d_state == desc_reusable || (blk_lpos->begin == FAILED_LPOS && blk_lpos->next == FAILED_LPOS)) { return -ENOENT; } return 0; } /* * Copy the ringbuffer data from the record with @seq to the provided * @r buffer. On success, 0 is returned. * * See desc_read_finalized_seq() for error return values. */ static int prb_read(struct printk_ringbuffer *rb, u64 seq, struct printk_record *r, unsigned int *line_count) { struct prb_desc_ring *desc_ring = &rb->desc_ring; struct printk_info *info = to_info(desc_ring, seq); struct prb_desc *rdesc = to_desc(desc_ring, seq); atomic_long_t *state_var = &rdesc->state_var; struct prb_desc desc; unsigned long id; int err; /* Extract the ID, used to specify the descriptor to read. */ id = DESC_ID(atomic_long_read(state_var)); /* Get a local copy of the correct descriptor (if available). */ err = desc_read_finalized_seq(desc_ring, id, seq, &desc); /* * If @r is NULL, the caller is only interested in the availability * of the record. */ if (err || !r) return err; /* If requested, copy meta data. */ if (r->info) memcpy(r->info, info, sizeof(*(r->info))); /* Copy text data. If it fails, this is a data-less record. */ if (!copy_data(&rb->text_data_ring, &desc.text_blk_lpos, info->text_len, r->text_buf, r->text_buf_size, line_count)) { return -ENOENT; } /* Ensure the record is still finalized and has the same @seq. */ return desc_read_finalized_seq(desc_ring, id, seq, &desc); } /* Get the sequence number of the tail descriptor. */ u64 prb_first_seq(struct printk_ringbuffer *rb) { struct prb_desc_ring *desc_ring = &rb->desc_ring; enum desc_state d_state; struct prb_desc desc; unsigned long id; u64 seq; for (;;) { id = atomic_long_read(&rb->desc_ring.tail_id); /* LMM(prb_first_seq:A) */ d_state = desc_read(desc_ring, id, &desc, &seq, NULL); /* LMM(prb_first_seq:B) */ /* * This loop will not be infinite because the tail is * _always_ in the finalized or reusable state. */ if (d_state == desc_finalized || d_state == desc_reusable) break; /* * Guarantee the last state load from desc_read() is before * reloading @tail_id in order to see a new tail in the case * that the descriptor has been recycled. This pairs with * desc_reserve:D. * * Memory barrier involvement: * * If prb_first_seq:B reads from desc_reserve:F, then * prb_first_seq:A reads from desc_push_tail:B. 
* * Relies on: * * MB from desc_push_tail:B to desc_reserve:F * matching * RMB prb_first_seq:B to prb_first_seq:A */ smp_rmb(); /* LMM(prb_first_seq:C) */ } return seq; } /** * prb_next_reserve_seq() - Get the sequence number after the most recently * reserved record. * * @rb: The ringbuffer to get the sequence number from. * * This is the public function available to readers to see what sequence * number will be assigned to the next reserved record. * * Note that depending on the situation, this value can be equal to or * higher than the sequence number returned by prb_next_seq(). * * Context: Any context. * Return: The sequence number that will be assigned to the next record * reserved. */ u64 prb_next_reserve_seq(struct printk_ringbuffer *rb) { struct prb_desc_ring *desc_ring = &rb->desc_ring; unsigned long last_finalized_id; atomic_long_t *state_var; u64 last_finalized_seq; unsigned long head_id; struct prb_desc desc; unsigned long diff; struct prb_desc *d; int err; /* * It may not be possible to read a sequence number for @head_id. * So the ID of @last_finailzed_seq is used to calculate what the * sequence number of @head_id will be. */ try_again: last_finalized_seq = desc_last_finalized_seq(rb); /* * @head_id is loaded after @last_finalized_seq to ensure that * it points to the record with @last_finalized_seq or newer. * * Memory barrier involvement: * * If desc_last_finalized_seq:A reads from * desc_update_last_finalized:A, then * prb_next_reserve_seq:A reads from desc_reserve:D. * * Relies on: * * RELEASE from desc_reserve:D to desc_update_last_finalized:A * matching * ACQUIRE from desc_last_finalized_seq:A to prb_next_reserve_seq:A * * Note: desc_reserve:D and desc_update_last_finalized:A can be * different CPUs. However, the desc_update_last_finalized:A CPU * (which performs the release) must have previously seen * desc_read:C, which implies desc_reserve:D can be seen. */ head_id = atomic_long_read(&desc_ring->head_id); /* LMM(prb_next_reserve_seq:A) */ d = to_desc(desc_ring, last_finalized_seq); state_var = &d->state_var; /* Extract the ID, used to specify the descriptor to read. */ last_finalized_id = DESC_ID(atomic_long_read(state_var)); /* Ensure @last_finalized_id is correct. */ err = desc_read_finalized_seq(desc_ring, last_finalized_id, last_finalized_seq, &desc); if (err == -EINVAL) { if (last_finalized_seq == 0) { /* * No record has been finalized or even reserved yet. * * The @head_id is initialized such that the first * increment will yield the first record (seq=0). * Handle it separately to avoid a negative @diff * below. */ if (head_id == DESC0_ID(desc_ring->count_bits)) return 0; /* * One or more descriptors are already reserved. Use * the descriptor ID of the first one (@seq=0) for * the @diff below. */ last_finalized_id = DESC0_ID(desc_ring->count_bits) + 1; } else { /* Record must have been overwritten. Try again. */ goto try_again; } } /* Diff of known descriptor IDs to compute related sequence numbers. */ diff = head_id - last_finalized_id; /* * @head_id points to the most recently reserved record, but this * function returns the sequence number that will be assigned to the * next (not yet reserved) record. Thus +1 is needed. */ return (last_finalized_seq + diff + 1); } /* * Non-blocking read of a record. * * On success @seq is updated to the record that was read and (if provided) * @r and @line_count will contain the read/calculated data. 
* * On failure @seq is updated to a record that is not yet available to the * reader, but it will be the next record available to the reader. * * Note: When the current CPU is in panic, this function will skip over any * non-existent/non-finalized records in order to allow the panic CPU * to print any and all records that have been finalized. */ static bool _prb_read_valid(struct printk_ringbuffer *rb, u64 *seq, struct printk_record *r, unsigned int *line_count) { u64 tail_seq; int err; while ((err = prb_read(rb, *seq, r, line_count))) { tail_seq = prb_first_seq(rb); if (*seq < tail_seq) { /* * Behind the tail. Catch up and try again. This * can happen for -ENOENT and -EINVAL cases. */ *seq = tail_seq; } else if (err == -ENOENT) { /* Record exists, but the data was lost. Skip. */ (*seq)++; } else { /* * Non-existent/non-finalized record. Must stop. * * For panic situations it cannot be expected that * non-finalized records will become finalized. But * there may be other finalized records beyond that * need to be printed for a panic situation. If this * is the panic CPU, skip this * non-existent/non-finalized record unless non-panic * CPUs are still running and their debugging is * explicitly enabled. * * Note that new messages printed on panic CPU are * finalized when we are here. The only exception * might be the last message without trailing newline. * But it would have the sequence number returned * by "prb_next_reserve_seq() - 1". */ if (panic_on_this_cpu() && (!debug_non_panic_cpus || legacy_allow_panic_sync) && ((*seq + 1) < prb_next_reserve_seq(rb))) { (*seq)++; } else { return false; } } } return true; } /** * prb_read_valid() - Non-blocking read of a requested record or (if gone) * the next available record. * * @rb: The ringbuffer to read from. * @seq: The sequence number of the record to read. * @r: A record data buffer to store the read record to. * * This is the public function available to readers to read a record. * * The reader provides the @info and @text_buf buffers of @r to be * filled in. Any of the buffer pointers can be set to NULL if the reader * is not interested in that data. To ensure proper initialization of @r, * prb_rec_init_rd() should be used. * * Context: Any context. * Return: true if a record was read, otherwise false. * * On success, the reader must check r->info.seq to see which record was * actually read. This allows the reader to detect dropped records. * * Failure means @seq refers to a record not yet available to the reader. */ bool prb_read_valid(struct printk_ringbuffer *rb, u64 seq, struct printk_record *r) { return _prb_read_valid(rb, &seq, r, NULL); } EXPORT_SYMBOL_IF_KUNIT(prb_read_valid); /** * prb_read_valid_info() - Non-blocking read of meta data for a requested * record or (if gone) the next available record. * * @rb: The ringbuffer to read from. * @seq: The sequence number of the record to read. * @info: A buffer to store the read record meta data to. * @line_count: A buffer to store the number of lines in the record text. * * This is the public function available to readers to read only the * meta data of a record. * * The reader provides the @info, @line_count buffers to be filled in. * Either of the buffer pointers can be set to NULL if the reader is not * interested in that data. * * Context: Any context. * Return: true if a record's meta data was read, otherwise false. * * On success, the reader must check info->seq to see which record meta data * was actually read. This allows the reader to detect dropped records. 
* * Failure means @seq refers to a record not yet available to the reader. */ bool prb_read_valid_info(struct printk_ringbuffer *rb, u64 seq, struct printk_info *info, unsigned int *line_count) { struct printk_record r; prb_rec_init_rd(&r, info, NULL, 0); return _prb_read_valid(rb, &seq, &r, line_count); } /** * prb_first_valid_seq() - Get the sequence number of the oldest available * record. * * @rb: The ringbuffer to get the sequence number from. * * This is the public function available to readers to see what the * first/oldest valid sequence number is. * * This provides readers a starting point to begin iterating the ringbuffer. * * Context: Any context. * Return: The sequence number of the first/oldest record or, if the * ringbuffer is empty, 0 is returned. */ u64 prb_first_valid_seq(struct printk_ringbuffer *rb) { u64 seq = 0; if (!_prb_read_valid(rb, &seq, NULL, NULL)) return 0; return seq; } /** * prb_next_seq() - Get the sequence number after the last available record. * * @rb: The ringbuffer to get the sequence number from. * * This is the public function available to readers to see what the next * newest sequence number available to readers will be. * * This provides readers a sequence number to jump to if all currently * available records should be skipped. It is guaranteed that all records * previous to the returned value have been finalized and are (or were) * available to the reader. * * Context: Any context. * Return: The sequence number of the next newest (not yet available) record * for readers. */ u64 prb_next_seq(struct printk_ringbuffer *rb) { u64 seq; seq = desc_last_finalized_seq(rb); /* * Begin searching after the last finalized record. * * On 0, the search must begin at 0 because of hack#2 * of the bootstrapping phase it is not known if a * record at index 0 exists. */ if (seq != 0) seq++; /* * The information about the last finalized @seq might be inaccurate. * Search forward to find the current one. */ while (_prb_read_valid(rb, &seq, NULL, NULL)) seq++; return seq; } /** * prb_init() - Initialize a ringbuffer to use provided external buffers. * * @rb: The ringbuffer to initialize. * @text_buf: The data buffer for text data. * @textbits: The size of @text_buf as a power-of-2 value. * @descs: The descriptor buffer for ringbuffer records. * @descbits: The count of @descs items as a power-of-2 value. * @infos: The printk_info buffer for ringbuffer records. * * This is the public function available to writers to setup a ringbuffer * during runtime using provided buffers. * * This must match the initialization of DEFINE_PRINTKRB(). * * Context: Any context. 
*/ void prb_init(struct printk_ringbuffer *rb, char *text_buf, unsigned int textbits, struct prb_desc *descs, unsigned int descbits, struct printk_info *infos) { memset(descs, 0, _DESCS_COUNT(descbits) * sizeof(descs[0])); memset(infos, 0, _DESCS_COUNT(descbits) * sizeof(infos[0])); rb->desc_ring.count_bits = descbits; rb->desc_ring.descs = descs; rb->desc_ring.infos = infos; atomic_long_set(&rb->desc_ring.head_id, DESC0_ID(descbits)); atomic_long_set(&rb->desc_ring.tail_id, DESC0_ID(descbits)); atomic_long_set(&rb->desc_ring.last_finalized_seq, 0); rb->text_data_ring.size_bits = textbits; rb->text_data_ring.data = text_buf; atomic_long_set(&rb->text_data_ring.head_lpos, BLK0_LPOS(textbits)); atomic_long_set(&rb->text_data_ring.tail_lpos, BLK0_LPOS(textbits)); atomic_long_set(&rb->fail, 0); atomic_long_set(&(descs[_DESCS_COUNT(descbits) - 1].state_var), DESC0_SV(descbits)); descs[_DESCS_COUNT(descbits) - 1].text_blk_lpos.begin = FAILED_LPOS; descs[_DESCS_COUNT(descbits) - 1].text_blk_lpos.next = FAILED_LPOS; infos[0].seq = -(u64)_DESCS_COUNT(descbits); infos[_DESCS_COUNT(descbits) - 1].seq = 0; } EXPORT_SYMBOL_IF_KUNIT(prb_init); /** * prb_record_text_space() - Query the full actual used ringbuffer space for * the text data of a reserved entry. * * @e: The successfully reserved entry to query. * * This is the public function available to writers to see how much actual * space is used in the ringbuffer to store the text data of the specified * entry. * * This function is only valid if @e has been successfully reserved using * prb_reserve(). * * Context: Any context. * Return: The size in bytes used by the text data of the associated record. */ unsigned int prb_record_text_space(struct prb_reserved_entry *e) { return e->text_space; }
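The reserve/commit and read interfaces above compose as follows. This is a minimal usage sketch, not part of printk_ringbuffer.c: it assumes the DEFINE_PRINTKRB(), prb_rec_init_wr() and prb_rec_init_rd() helpers declared in printk_ringbuffer.h, and the names test_rb, append_msg and dump_msgs are invented for illustration.

#include <linux/printk.h>
#include <linux/string.h>
#include <linux/minmax.h>
#include "printk_ringbuffer.h"

/* 2^7 = 128 descriptors, average text size 2^5 = 32 bytes. */
DEFINE_PRINTKRB(test_rb, 7, 5);

/* Writer side: reserve, fill in the text and @text_len, then finalize. */
static void append_msg(const char *text, unsigned int len)
{
	struct prb_reserved_entry e;
	struct printk_record r;

	prb_rec_init_wr(&r, len);
	if (!prb_reserve(&e, &test_rb, &r))
		return;	/* reserve failed; interrupts already restored */

	memcpy(r.text_buf, text, len);
	r.info->text_len = len;	/* must be set by the writer */
	prb_final_commit(&e);	/* make it readable immediately */
}

/* Reader side: iterate all finalized records starting at seq = 0. */
static void dump_msgs(void)
{
	struct printk_info info;
	struct printk_record r;
	char buf[128];
	u64 seq = 0;

	prb_rec_init_rd(&r, &info, buf, sizeof(buf));

	while (prb_read_valid(&test_rb, seq, &r)) {
		int len = min_t(int, info.text_len, (int)sizeof(buf));

		pr_info("%llu: %.*s\n", info.seq, len, buf);
		seq = info.seq + 1;	/* info.seq may have skipped dropped records */
	}
}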
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra
 *
 * Provides a framework for enqueueing and running callbacks from hardirq
 * context. The enqueueing is NMI-safe.
 */

#include <linux/bug.h>
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/irq_work.h>
#include <linux/percpu.h>
#include <linux/hardirq.h>
#include <linux/irqflags.h>
#include <linux/sched.h>
#include <linux/tick.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/smp.h>
#include <linux/smpboot.h>
#include <asm/processor.h>
#include <linux/kasan.h>

#include <trace/events/ipi.h>

static DEFINE_PER_CPU(struct llist_head, raised_list);
static DEFINE_PER_CPU(struct llist_head, lazy_list);
static DEFINE_PER_CPU(struct task_struct *, irq_workd);

static void wake_irq_workd(void)
{
	struct task_struct *tsk = __this_cpu_read(irq_workd);

	if (!llist_empty(this_cpu_ptr(&lazy_list)) && tsk)
		wake_up_process(tsk);
}

#ifdef CONFIG_SMP
static void irq_work_wake(struct irq_work *entry)
{
	wake_irq_workd();
}

static DEFINE_PER_CPU(struct irq_work, irq_work_wakeup) =
	IRQ_WORK_INIT_HARD(irq_work_wake);
#endif

static int irq_workd_should_run(unsigned int cpu)
{
	return !llist_empty(this_cpu_ptr(&lazy_list));
}

/*
 * Claim the entry so that no one else will poke at it.
 */
static bool irq_work_claim(struct irq_work *work)
{
	int oflags;

	oflags = atomic_fetch_or(IRQ_WORK_CLAIMED | CSD_TYPE_IRQ_WORK, &work->node.a_flags);
	/*
	 * If the work is already pending, no need to raise the IPI.
	 * The pairing smp_mb() in irq_work_single() makes sure
	 * everything we did before is visible.
*/ if (oflags & IRQ_WORK_PENDING) return false; return true; } void __weak arch_irq_work_raise(void) { /* * Lame architectures will get the timer tick callback */ } static __always_inline void irq_work_raise(struct irq_work *work) { if (trace_ipi_send_cpu_enabled() && arch_irq_work_has_interrupt()) trace_ipi_send_cpu(smp_processor_id(), _RET_IP_, work->func); arch_irq_work_raise(); } /* Enqueue on current CPU, work must already be claimed and preempt disabled */ static void __irq_work_queue_local(struct irq_work *work) { struct llist_head *list; bool rt_lazy_work = false; bool lazy_work = false; int work_flags; work_flags = atomic_read(&work->node.a_flags); if (work_flags & IRQ_WORK_LAZY) lazy_work = true; else if (IS_ENABLED(CONFIG_PREEMPT_RT) && !(work_flags & IRQ_WORK_HARD_IRQ)) rt_lazy_work = true; if (lazy_work || rt_lazy_work) list = this_cpu_ptr(&lazy_list); else list = this_cpu_ptr(&raised_list); if (!llist_add(&work->node.llist, list)) return; /* If the work is "lazy", handle it from next tick if any */ if (!lazy_work || tick_nohz_tick_stopped()) irq_work_raise(work); } /* Enqueue the irq work @work on the current CPU */ bool irq_work_queue(struct irq_work *work) { /* Only queue if not already pending */ if (!irq_work_claim(work)) return false; /* Queue the entry and raise the IPI if needed. */ preempt_disable(); __irq_work_queue_local(work); preempt_enable(); return true; } EXPORT_SYMBOL_GPL(irq_work_queue); /* * Enqueue the irq_work @work on @cpu unless it's already pending * somewhere. * * Can be re-enqueued while the callback is still in progress. */ bool irq_work_queue_on(struct irq_work *work, int cpu) { #ifndef CONFIG_SMP return irq_work_queue(work); #else /* CONFIG_SMP: */ /* All work should have been flushed before going offline */ WARN_ON_ONCE(cpu_is_offline(cpu)); /* Only queue if not already pending */ if (!irq_work_claim(work)) return false; kasan_record_aux_stack(work); preempt_disable(); if (cpu != smp_processor_id()) { /* Arch remote IPI send/receive backend aren't NMI safe */ WARN_ON_ONCE(in_nmi()); /* * On PREEMPT_RT the items which are not marked as * IRQ_WORK_HARD_IRQ are added to the lazy list and a HARD work * item is used on the remote CPU to wake the thread. */ if (IS_ENABLED(CONFIG_PREEMPT_RT) && !(atomic_read(&work->node.a_flags) & IRQ_WORK_HARD_IRQ)) { if (!llist_add(&work->node.llist, &per_cpu(lazy_list, cpu))) goto out; work = &per_cpu(irq_work_wakeup, cpu); if (!irq_work_claim(work)) goto out; } __smp_call_single_queue(cpu, &work->node.llist); } else { __irq_work_queue_local(work); } out: preempt_enable(); return true; #endif /* CONFIG_SMP */ } bool irq_work_needs_cpu(void) { struct llist_head *raised, *lazy; raised = this_cpu_ptr(&raised_list); lazy = this_cpu_ptr(&lazy_list); if (llist_empty(raised) || arch_irq_work_has_interrupt()) if (llist_empty(lazy)) return false; /* All work should have been flushed before going offline */ WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); return true; } void irq_work_single(void *arg) { struct irq_work *work = arg; int flags; /* * Clear the PENDING bit, after this point the @work can be re-used. * The PENDING bit acts as a lock, and we own it, so we can clear it * without atomic ops. */ flags = atomic_read(&work->node.a_flags); flags &= ~IRQ_WORK_PENDING; atomic_set(&work->node.a_flags, flags); /* * See irq_work_claim(). */ smp_mb(); lockdep_irq_work_enter(flags); work->func(work); lockdep_irq_work_exit(flags); /* * Clear the BUSY bit, if set, and return to the free state if no-one * else claimed it meanwhile. 
*/ (void)atomic_cmpxchg(&work->node.a_flags, flags, flags & ~IRQ_WORK_BUSY); if ((IS_ENABLED(CONFIG_PREEMPT_RT) && !irq_work_is_hard(work)) || !arch_irq_work_has_interrupt()) rcuwait_wake_up(&work->irqwait); } static void irq_work_run_list(struct llist_head *list) { struct irq_work *work, *tmp; struct llist_node *llnode; /* * On PREEMPT_RT IRQ-work which is not marked as HARD will be processed * in a per-CPU thread in preemptible context. Only the items which are * marked as IRQ_WORK_HARD_IRQ will be processed in hardirq context. */ BUG_ON(!irqs_disabled() && !IS_ENABLED(CONFIG_PREEMPT_RT)); if (llist_empty(list)) return; llnode = llist_del_all(list); llist_for_each_entry_safe(work, tmp, llnode, node.llist) irq_work_single(work); } /* * hotplug calls this through: * hotplug_cfd() -> flush_smp_call_function_queue() */ void irq_work_run(void) { irq_work_run_list(this_cpu_ptr(&raised_list)); if (!IS_ENABLED(CONFIG_PREEMPT_RT)) irq_work_run_list(this_cpu_ptr(&lazy_list)); else wake_irq_workd(); } EXPORT_SYMBOL_GPL(irq_work_run); void irq_work_tick(void) { struct llist_head *raised = this_cpu_ptr(&raised_list); if (!llist_empty(raised) && !arch_irq_work_has_interrupt()) irq_work_run_list(raised); if (!IS_ENABLED(CONFIG_PREEMPT_RT)) irq_work_run_list(this_cpu_ptr(&lazy_list)); else wake_irq_workd(); } /* * Synchronize against the irq_work @entry, ensures the entry is not * currently in use. */ void irq_work_sync(struct irq_work *work) { lockdep_assert_irqs_enabled(); might_sleep(); if ((IS_ENABLED(CONFIG_PREEMPT_RT) && !irq_work_is_hard(work)) || !arch_irq_work_has_interrupt()) { rcuwait_wait_event(&work->irqwait, !irq_work_is_busy(work), TASK_UNINTERRUPTIBLE); return; } while (irq_work_is_busy(work)) cpu_relax(); } EXPORT_SYMBOL_GPL(irq_work_sync); static void run_irq_workd(unsigned int cpu) { irq_work_run_list(this_cpu_ptr(&lazy_list)); } static void irq_workd_setup(unsigned int cpu) { sched_set_fifo_low(current); } static struct smp_hotplug_thread irqwork_threads = { .store = &irq_workd, .setup = irq_workd_setup, .thread_should_run = irq_workd_should_run, .thread_fn = run_irq_workd, .thread_comm = "irq_work/%u", }; static __init int irq_work_init_threads(void) { if (IS_ENABLED(CONFIG_PREEMPT_RT)) BUG_ON(smpboot_register_percpu_thread(&irqwork_threads)); return 0; } early_initcall(irq_work_init_threads);
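A minimal usage sketch of the queueing API above, not part of irq_work.c: the names my_irq_work, my_irq_work_fn and the two wrapper functions are invented for illustration; IRQ_WORK_INIT() is the initializer from <linux/irq_work.h> (the _LAZY and _HARD variants used earlier in this file also come from there).

#include <linux/irq_work.h>
#include <linux/printk.h>
#include <linux/smp.h>

static void my_irq_work_fn(struct irq_work *work)
{
	/* Runs in hardirq context (or the irq_work kthread on PREEMPT_RT). */
	pr_info("irq_work ran on CPU %d\n", smp_processor_id());
}

static struct irq_work my_irq_work = IRQ_WORK_INIT(my_irq_work_fn);

static void example_raise(void)
{
	/* NMI-safe; returns false if the work item was already pending. */
	irq_work_queue(&my_irq_work);
}

static void example_teardown(void)
{
	/* May sleep: waits until a callback in progress has completed. */
	irq_work_sync(&my_irq_work);
}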
2 566 265 254 65 30 70 52 45 17 70 43 67 20 652 1 651 79 95 46 90 569 1 568 566 2 671 1 5 3 1 2 638 24 95 569 49 651 33 10 23 372 344 1 371 370 372 372 371 371 371 371 371 371 373 1 372 371 372 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 // SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/init.h> #include <linux/errno.h> #include <linux/mm.h> #include <linux/mman.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/io_uring.h> #include <linux/io_uring_types.h> #include <asm/shmparam.h> #include "memmap.h" #include "kbuf.h" #include "rsrc.h" #include "zcrx.h" static void *io_mem_alloc_compound(struct page **pages, int nr_pages, size_t size, gfp_t gfp) { struct page *page; int i, order; order = get_order(size); if (order > MAX_PAGE_ORDER) return ERR_PTR(-ENOMEM); else if (order) gfp |= __GFP_COMP; page = alloc_pages(gfp, order); if (!page) return ERR_PTR(-ENOMEM); for (i = 0; i < nr_pages; i++) pages[i] = page + i; return page_address(page); } struct page **io_pin_pages(unsigned long uaddr, unsigned long len, int *npages) { unsigned long start, end, nr_pages; struct page **pages; int ret; if (check_add_overflow(uaddr, len, &end)) return ERR_PTR(-EOVERFLOW); if (check_add_overflow(end, PAGE_SIZE - 1, &end)) return ERR_PTR(-EOVERFLOW); end = end >> PAGE_SHIFT; start = uaddr >> PAGE_SHIFT; nr_pages = end - start; if (WARN_ON_ONCE(!nr_pages)) return ERR_PTR(-EINVAL); if (WARN_ON_ONCE(nr_pages > INT_MAX)) return ERR_PTR(-EOVERFLOW); pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL); if (!pages) return ERR_PTR(-ENOMEM); ret = pin_user_pages_fast(uaddr, nr_pages, FOLL_WRITE | FOLL_LONGTERM, pages); /* success, mapped all pages */ if (ret == nr_pages) { *npages = nr_pages; return pages; } /* partial map, or didn't map anything */ if (ret >= 0) { /* if we did partial map, release any pages we did get */ if (ret) unpin_user_pages(pages, ret); ret = -EFAULT; } kvfree(pages); return ERR_PTR(ret); } enum { /* memory was vmap'ed for the 
kernel, freeing the region vunmap's it */ IO_REGION_F_VMAP = 1, /* memory is provided by user and pinned by the kernel */ IO_REGION_F_USER_PROVIDED = 2, /* only the first page in the array is ref'ed */ IO_REGION_F_SINGLE_REF = 4, }; void io_free_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr) { if (mr->pages) { long nr_refs = mr->nr_pages; if (mr->flags & IO_REGION_F_SINGLE_REF) nr_refs = 1; if (mr->flags & IO_REGION_F_USER_PROVIDED) unpin_user_pages(mr->pages, nr_refs); else release_pages(mr->pages, nr_refs); kvfree(mr->pages); } if ((mr->flags & IO_REGION_F_VMAP) && mr->ptr) vunmap(mr->ptr); if (mr->nr_pages && ctx->user) __io_unaccount_mem(ctx->user, mr->nr_pages); memset(mr, 0, sizeof(*mr)); } static int io_region_init_ptr(struct io_mapped_region *mr) { struct io_imu_folio_data ifd; void *ptr; if (io_check_coalesce_buffer(mr->pages, mr->nr_pages, &ifd)) { if (ifd.nr_folios == 1 && !PageHighMem(mr->pages[0])) { mr->ptr = page_address(mr->pages[0]); return 0; } } ptr = vmap(mr->pages, mr->nr_pages, VM_MAP, PAGE_KERNEL); if (!ptr) return -ENOMEM; mr->ptr = ptr; mr->flags |= IO_REGION_F_VMAP; return 0; } static int io_region_pin_pages(struct io_ring_ctx *ctx, struct io_mapped_region *mr, struct io_uring_region_desc *reg) { unsigned long size = mr->nr_pages << PAGE_SHIFT; struct page **pages; int nr_pages; pages = io_pin_pages(reg->user_addr, size, &nr_pages); if (IS_ERR(pages)) return PTR_ERR(pages); if (WARN_ON_ONCE(nr_pages != mr->nr_pages)) return -EFAULT; mr->pages = pages; mr->flags |= IO_REGION_F_USER_PROVIDED; return 0; } static int io_region_allocate_pages(struct io_ring_ctx *ctx, struct io_mapped_region *mr, struct io_uring_region_desc *reg, unsigned long mmap_offset) { gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_ZERO | __GFP_NOWARN; size_t size = (size_t) mr->nr_pages << PAGE_SHIFT; unsigned long nr_allocated; struct page **pages; void *p; pages = kvmalloc_array(mr->nr_pages, sizeof(*pages), gfp); if (!pages) return -ENOMEM; p = io_mem_alloc_compound(pages, mr->nr_pages, size, gfp); if (!IS_ERR(p)) { mr->flags |= IO_REGION_F_SINGLE_REF; goto done; } nr_allocated = alloc_pages_bulk_node(gfp, NUMA_NO_NODE, mr->nr_pages, pages); if (nr_allocated != mr->nr_pages) { if (nr_allocated) release_pages(pages, nr_allocated); kvfree(pages); return -ENOMEM; } done: reg->mmap_offset = mmap_offset; mr->pages = pages; return 0; } int io_create_region(struct io_ring_ctx *ctx, struct io_mapped_region *mr, struct io_uring_region_desc *reg, unsigned long mmap_offset) { int nr_pages, ret; u64 end; if (WARN_ON_ONCE(mr->pages || mr->ptr || mr->nr_pages)) return -EFAULT; if (memchr_inv(&reg->__resv, 0, sizeof(reg->__resv))) return -EINVAL; if (reg->flags & ~IORING_MEM_REGION_TYPE_USER) return -EINVAL; /* user_addr should be set IFF it's a user memory backed region */ if ((reg->flags & IORING_MEM_REGION_TYPE_USER) != !!reg->user_addr) return -EFAULT; if (!reg->size || reg->mmap_offset || reg->id) return -EINVAL; if ((reg->size >> PAGE_SHIFT) > INT_MAX) return -E2BIG; if ((reg->user_addr | reg->size) & ~PAGE_MASK) return -EINVAL; if (check_add_overflow(reg->user_addr, reg->size, &end)) return -EOVERFLOW; nr_pages = reg->size >> PAGE_SHIFT; if (ctx->user) { ret = __io_account_mem(ctx->user, nr_pages); if (ret) return ret; } mr->nr_pages = nr_pages; if (reg->flags & IORING_MEM_REGION_TYPE_USER) ret = io_region_pin_pages(ctx, mr, reg); else ret = io_region_allocate_pages(ctx, mr, reg, mmap_offset); if (ret) goto out_free; ret = io_region_init_ptr(mr); if (ret) goto out_free; return 0; out_free: 
io_free_region(ctx, mr); return ret; } int io_create_region_mmap_safe(struct io_ring_ctx *ctx, struct io_mapped_region *mr, struct io_uring_region_desc *reg, unsigned long mmap_offset) { struct io_mapped_region tmp_mr; int ret; memcpy(&tmp_mr, mr, sizeof(tmp_mr)); ret = io_create_region(ctx, &tmp_mr, reg, mmap_offset); if (ret) return ret; /* * Once published mmap can find it without holding only the ->mmap_lock * and not ->uring_lock. */ guard(mutex)(&ctx->mmap_lock); memcpy(mr, &tmp_mr, sizeof(tmp_mr)); return 0; } static struct io_mapped_region *io_mmap_get_region(struct io_ring_ctx *ctx, loff_t pgoff) { loff_t offset = pgoff << PAGE_SHIFT; unsigned int id; switch (offset & IORING_OFF_MMAP_MASK) { case IORING_OFF_SQ_RING: case IORING_OFF_CQ_RING: return &ctx->ring_region; case IORING_OFF_SQES: return &ctx->sq_region; case IORING_OFF_PBUF_RING: id = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_PBUF_SHIFT; return io_pbuf_get_region(ctx, id); case IORING_MAP_OFF_PARAM_REGION: return &ctx->param_region; case IORING_MAP_OFF_ZCRX_REGION: id = (offset & ~IORING_OFF_MMAP_MASK) >> IORING_OFF_ZCRX_SHIFT; return io_zcrx_get_region(ctx, id); } return NULL; } static void *io_region_validate_mmap(struct io_ring_ctx *ctx, struct io_mapped_region *mr) { lockdep_assert_held(&ctx->mmap_lock); if (!io_region_is_set(mr)) return ERR_PTR(-EINVAL); if (mr->flags & IO_REGION_F_USER_PROVIDED) return ERR_PTR(-EINVAL); return io_region_get_ptr(mr); } static void *io_uring_validate_mmap_request(struct file *file, loff_t pgoff, size_t sz) { struct io_ring_ctx *ctx = file->private_data; struct io_mapped_region *region; region = io_mmap_get_region(ctx, pgoff); if (!region) return ERR_PTR(-EINVAL); return io_region_validate_mmap(ctx, region); } #ifdef CONFIG_MMU static int io_region_mmap(struct io_ring_ctx *ctx, struct io_mapped_region *mr, struct vm_area_struct *vma, unsigned max_pages) { unsigned long nr_pages = min(mr->nr_pages, max_pages); vm_flags_set(vma, VM_DONTEXPAND); return vm_insert_pages(vma, vma->vm_start, mr->pages, &nr_pages); } __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma) { struct io_ring_ctx *ctx = file->private_data; size_t sz = vma->vm_end - vma->vm_start; long offset = vma->vm_pgoff << PAGE_SHIFT; unsigned int page_limit = UINT_MAX; struct io_mapped_region *region; void *ptr; guard(mutex)(&ctx->mmap_lock); ptr = io_uring_validate_mmap_request(file, vma->vm_pgoff, sz); if (IS_ERR(ptr)) return PTR_ERR(ptr); switch (offset & IORING_OFF_MMAP_MASK) { case IORING_OFF_SQ_RING: case IORING_OFF_CQ_RING: page_limit = (sz + PAGE_SIZE - 1) >> PAGE_SHIFT; break; } region = io_mmap_get_region(ctx, vma->vm_pgoff); return io_region_mmap(ctx, region, vma, page_limit); } unsigned long io_uring_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { struct io_ring_ctx *ctx = filp->private_data; void *ptr; /* * Do not allow to map to user-provided address to avoid breaking the * aliasing rules. Userspace is not able to guess the offset address of * kernel kmalloc()ed memory area. */ if (addr) return -EINVAL; guard(mutex)(&ctx->mmap_lock); ptr = io_uring_validate_mmap_request(filp, pgoff, len); if (IS_ERR(ptr)) return -ENOMEM; /* * Some architectures have strong cache aliasing requirements. * For such architectures we need a coherent mapping which aliases * kernel memory *and* userspace memory. 
To achieve that: * - use a NULL file pointer to reference physical memory, and * - use the kernel virtual address of the shared io_uring context * (instead of the userspace-provided address, which has to be 0UL * anyway). * - use the same pgoff which the get_unmapped_area() uses to * calculate the page colouring. * For architectures without such aliasing requirements, the * architecture will return any suitable mapping because addr is 0. */ filp = NULL; flags |= MAP_SHARED; pgoff = 0; /* has been translated to ptr above */ #ifdef SHM_COLOUR addr = (uintptr_t) ptr; pgoff = addr >> PAGE_SHIFT; #else addr = 0UL; #endif return mm_get_unmapped_area(current->mm, filp, addr, len, pgoff, flags); } #else /* !CONFIG_MMU */ int io_uring_mmap(struct file *file, struct vm_area_struct *vma) { return is_nommu_shared_mapping(vma->vm_flags) ? 0 : -EINVAL; } unsigned int io_uring_nommu_mmap_capabilities(struct file *file) { return NOMMU_MAP_DIRECT | NOMMU_MAP_READ | NOMMU_MAP_WRITE; } unsigned long io_uring_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { struct io_ring_ctx *ctx = file->private_data; void *ptr; guard(mutex)(&ctx->mmap_lock); ptr = io_uring_validate_mmap_request(file, pgoff, len); if (IS_ERR(ptr)) return PTR_ERR(ptr); return (unsigned long) ptr; } #endif /* !CONFIG_MMU */
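A hedged sketch of how an io_uring-internal caller might use the region helpers above; it is not part of memmap.c. The names example_setup_region, example_free_region and EXAMPLE_MMAP_OFF are invented for illustration (real callers use offsets such as IORING_MAP_OFF_PARAM_REGION, which io_mmap_get_region() already routes); io_region_get_ptr() is assumed to come from memmap.h.

#include <linux/mm.h>
#include <linux/string.h>
#include <linux/io_uring_types.h>
#include "memmap.h"

/* Invented offset; a real one must also be handled in io_mmap_get_region(). */
#define EXAMPLE_MMAP_OFF	0x80000000ULL

static int example_setup_region(struct io_ring_ctx *ctx,
				struct io_mapped_region *mr, size_t size)
{
	struct io_uring_region_desc reg = {
		/* Kernel-allocated region: no user_addr, no TYPE_USER flag. */
		.size = PAGE_ALIGN(size),
	};
	int ret;

	/* Publish under ->mmap_lock so a racing mmap() never sees a half-set-up region. */
	ret = io_create_region_mmap_safe(ctx, mr, &reg, EXAMPLE_MMAP_OFF);
	if (ret)
		return ret;

	/* The kernel mapping is valid until io_free_region(). */
	memset(io_region_get_ptr(mr), 0, reg.size);
	return 0;
}

static void example_free_region(struct io_ring_ctx *ctx,
				struct io_mapped_region *mr)
{
	io_free_region(ctx, mr);
}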
// SPDX-License-Identifier: GPL-2.0
#include <linux/static_call.h>

long __static_call_return0(void)
{
	return 0;
}
EXPORT_SYMBOL_GPL(__static_call_return0);
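A hedged usage sketch of the fallback above, not part of this file: __static_call_return0() is the shared "always return 0" target, and a call site can be switched to it via static_call_update() by casting it to the call's function type (the same pattern the scheduler uses for might_resched). The example_check names are invented for illustration.

#include <linux/static_call.h>

static int example_check_impl(int arg)
{
	return arg > 0;
}

DEFINE_STATIC_CALL(example_check, example_check_impl);

static int example_fast_path(int arg)
{
	/* Direct call where HAVE_STATIC_CALL, trampoline/indirect call otherwise. */
	return static_call(example_check)(arg);
}

static void example_disable_check(void)
{
	/* Future calls return 0 without an extra conditional at the call site. */
	static_call_update(example_check,
			   (typeof(&example_check_impl))__static_call_return0);
}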
6 6 4 4 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 // SPDX-License-Identifier: GPL-2.0-or-later /* drivers/net/ifb.c: The purpose of this driver is to provide a device that allows for sharing of resources: 1) qdiscs/policies that are per device as opposed to system wide. ifb allows for a device which can be redirected to thus providing an impression of sharing. 2) Allows for queueing incoming traffic for shaping instead of dropping. The original concept is based on what is known as the IMQ driver initially written by Martin Devera, later rewritten by Patrick McHardy and then maintained by Andre Correa. You need the tc action mirror or redirect to feed this device packets. Authors: Jamal Hadi Salim (2005) */ #include <linux/module.h> #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/ethtool.h> #include <linux/etherdevice.h> #include <linux/init.h> #include <linux/interrupt.h> #include <linux/moduleparam.h> #include <linux/netfilter_netdev.h> #include <net/pkt_sched.h> #include <net/net_namespace.h> #define TX_Q_LIMIT 32 struct ifb_q_stats { u64 packets; u64 bytes; struct u64_stats_sync sync; }; struct ifb_q_private { struct net_device *dev; struct tasklet_struct ifb_tasklet; int tasklet_pending; int txqnum; struct sk_buff_head rq; struct sk_buff_head tq; struct ifb_q_stats rx_stats; struct ifb_q_stats tx_stats; } ____cacheline_aligned_in_smp; struct ifb_dev_private { struct ifb_q_private *tx_private; }; /* For ethtools stats. 
*/ struct ifb_q_stats_desc { char desc[ETH_GSTRING_LEN]; size_t offset; }; #define IFB_Q_STAT(m) offsetof(struct ifb_q_stats, m) static const struct ifb_q_stats_desc ifb_q_stats_desc[] = { { "packets", IFB_Q_STAT(packets) }, { "bytes", IFB_Q_STAT(bytes) }, }; #define IFB_Q_STATS_LEN ARRAY_SIZE(ifb_q_stats_desc) static netdev_tx_t ifb_xmit(struct sk_buff *skb, struct net_device *dev); static int ifb_open(struct net_device *dev); static int ifb_close(struct net_device *dev); static void ifb_update_q_stats(struct ifb_q_stats *stats, int len) { u64_stats_update_begin(&stats->sync); stats->packets++; stats->bytes += len; u64_stats_update_end(&stats->sync); } static void ifb_ri_tasklet(struct tasklet_struct *t) { struct ifb_q_private *txp = from_tasklet(txp, t, ifb_tasklet); struct netdev_queue *txq; struct sk_buff *skb; txq = netdev_get_tx_queue(txp->dev, txp->txqnum); skb = skb_peek(&txp->tq); if (!skb) { if (!__netif_tx_trylock(txq)) goto resched; skb_queue_splice_tail_init(&txp->rq, &txp->tq); __netif_tx_unlock(txq); } while ((skb = __skb_dequeue(&txp->tq)) != NULL) { /* Skip tc and netfilter to prevent redirection loop. */ skb->redirected = 0; #ifdef CONFIG_NET_CLS_ACT skb->tc_skip_classify = 1; #endif nf_skip_egress(skb, true); ifb_update_q_stats(&txp->tx_stats, skb->len); rcu_read_lock(); skb->dev = dev_get_by_index_rcu(dev_net(txp->dev), skb->skb_iif); if (!skb->dev) { rcu_read_unlock(); dev_kfree_skb(skb); txp->dev->stats.tx_dropped++; if (skb_queue_len(&txp->tq) != 0) goto resched; break; } rcu_read_unlock(); skb->skb_iif = txp->dev->ifindex; if (!skb->from_ingress) { dev_queue_xmit(skb); } else { skb_pull_rcsum(skb, skb->mac_len); netif_receive_skb(skb); } } if (__netif_tx_trylock(txq)) { skb = skb_peek(&txp->rq); if (!skb) { txp->tasklet_pending = 0; if (netif_tx_queue_stopped(txq)) netif_tx_wake_queue(txq); } else { __netif_tx_unlock(txq); goto resched; } __netif_tx_unlock(txq); } else { resched: txp->tasklet_pending = 1; tasklet_schedule(&txp->ifb_tasklet); } } static void ifb_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) { struct ifb_dev_private *dp = netdev_priv(dev); struct ifb_q_private *txp = dp->tx_private; unsigned int start; u64 packets, bytes; int i; for (i = 0; i < dev->num_tx_queues; i++,txp++) { do { start = u64_stats_fetch_begin(&txp->rx_stats.sync); packets = txp->rx_stats.packets; bytes = txp->rx_stats.bytes; } while (u64_stats_fetch_retry(&txp->rx_stats.sync, start)); stats->rx_packets += packets; stats->rx_bytes += bytes; do { start = u64_stats_fetch_begin(&txp->tx_stats.sync); packets = txp->tx_stats.packets; bytes = txp->tx_stats.bytes; } while (u64_stats_fetch_retry(&txp->tx_stats.sync, start)); stats->tx_packets += packets; stats->tx_bytes += bytes; } stats->rx_dropped = dev->stats.rx_dropped; stats->tx_dropped = dev->stats.tx_dropped; } static int ifb_dev_init(struct net_device *dev) { struct ifb_dev_private *dp = netdev_priv(dev); struct ifb_q_private *txp; int i; txp = kcalloc(dev->num_tx_queues, sizeof(*txp), GFP_KERNEL); if (!txp) return -ENOMEM; dp->tx_private = txp; for (i = 0; i < dev->num_tx_queues; i++,txp++) { txp->txqnum = i; txp->dev = dev; __skb_queue_head_init(&txp->rq); __skb_queue_head_init(&txp->tq); u64_stats_init(&txp->rx_stats.sync); u64_stats_init(&txp->tx_stats.sync); tasklet_setup(&txp->ifb_tasklet, ifb_ri_tasklet); netif_tx_start_queue(netdev_get_tx_queue(dev, i)); } return 0; } static void ifb_get_strings(struct net_device *dev, u32 stringset, u8 *buf) { u8 *p = buf; int i, j; switch (stringset) { case ETH_SS_STATS: 
for (i = 0; i < dev->real_num_rx_queues; i++) for (j = 0; j < IFB_Q_STATS_LEN; j++) ethtool_sprintf(&p, "rx_queue_%u_%.18s", i, ifb_q_stats_desc[j].desc); for (i = 0; i < dev->real_num_tx_queues; i++) for (j = 0; j < IFB_Q_STATS_LEN; j++) ethtool_sprintf(&p, "tx_queue_%u_%.18s", i, ifb_q_stats_desc[j].desc); break; } } static int ifb_get_sset_count(struct net_device *dev, int sset) { switch (sset) { case ETH_SS_STATS: return IFB_Q_STATS_LEN * (dev->real_num_rx_queues + dev->real_num_tx_queues); default: return -EOPNOTSUPP; } } static void ifb_fill_stats_data(u64 **data, struct ifb_q_stats *q_stats) { void *stats_base = (void *)q_stats; unsigned int start; size_t offset; int j; do { start = u64_stats_fetch_begin(&q_stats->sync); for (j = 0; j < IFB_Q_STATS_LEN; j++) { offset = ifb_q_stats_desc[j].offset; (*data)[j] = *(u64 *)(stats_base + offset); } } while (u64_stats_fetch_retry(&q_stats->sync, start)); *data += IFB_Q_STATS_LEN; } static void ifb_get_ethtool_stats(struct net_device *dev, struct ethtool_stats *stats, u64 *data) { struct ifb_dev_private *dp = netdev_priv(dev); struct ifb_q_private *txp; int i; for (i = 0; i < dev->real_num_rx_queues; i++) { txp = dp->tx_private + i; ifb_fill_stats_data(&data, &txp->rx_stats); } for (i = 0; i < dev->real_num_tx_queues; i++) { txp = dp->tx_private + i; ifb_fill_stats_data(&data, &txp->tx_stats); } } static const struct net_device_ops ifb_netdev_ops = { .ndo_open = ifb_open, .ndo_stop = ifb_close, .ndo_get_stats64 = ifb_stats64, .ndo_start_xmit = ifb_xmit, .ndo_validate_addr = eth_validate_addr, .ndo_init = ifb_dev_init, }; static const struct ethtool_ops ifb_ethtool_ops = { .get_strings = ifb_get_strings, .get_sset_count = ifb_get_sset_count, .get_ethtool_stats = ifb_get_ethtool_stats, }; #define IFB_FEATURES (NETIF_F_HW_CSUM | NETIF_F_SG | NETIF_F_FRAGLIST | \ NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ENCAP_ALL | \ NETIF_F_HIGHDMA | NETIF_F_HW_VLAN_CTAG_TX | \ NETIF_F_HW_VLAN_STAG_TX) static void ifb_dev_free(struct net_device *dev) { struct ifb_dev_private *dp = netdev_priv(dev); struct ifb_q_private *txp = dp->tx_private; int i; for (i = 0; i < dev->num_tx_queues; i++,txp++) { tasklet_kill(&txp->ifb_tasklet); __skb_queue_purge(&txp->rq); __skb_queue_purge(&txp->tq); } kfree(dp->tx_private); } static void ifb_setup(struct net_device *dev) { /* Initialize the device structure. */ dev->netdev_ops = &ifb_netdev_ops; dev->ethtool_ops = &ifb_ethtool_ops; /* Fill in device structure with ethernet-generic values. 
*/ ether_setup(dev); dev->tx_queue_len = TX_Q_LIMIT; dev->features |= IFB_FEATURES; dev->hw_features |= dev->features; dev->hw_enc_features |= dev->features; dev->vlan_features |= IFB_FEATURES & ~(NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX); dev->flags |= IFF_NOARP; dev->flags &= ~IFF_MULTICAST; dev->priv_flags &= ~IFF_TX_SKB_SHARING; netif_keep_dst(dev); eth_hw_addr_random(dev); dev->needs_free_netdev = true; dev->priv_destructor = ifb_dev_free; dev->min_mtu = 0; dev->max_mtu = 0; netif_set_tso_max_size(dev, GSO_MAX_SIZE); } static netdev_tx_t ifb_xmit(struct sk_buff *skb, struct net_device *dev) { struct ifb_dev_private *dp = netdev_priv(dev); struct ifb_q_private *txp = dp->tx_private + skb_get_queue_mapping(skb); ifb_update_q_stats(&txp->rx_stats, skb->len); if (!skb->redirected || !skb->skb_iif) { dev_kfree_skb(skb); dev->stats.rx_dropped++; return NETDEV_TX_OK; } if (skb_queue_len(&txp->rq) >= dev->tx_queue_len) netif_tx_stop_queue(netdev_get_tx_queue(dev, txp->txqnum)); __skb_queue_tail(&txp->rq, skb); if (!txp->tasklet_pending) { txp->tasklet_pending = 1; tasklet_schedule(&txp->ifb_tasklet); } return NETDEV_TX_OK; } static int ifb_close(struct net_device *dev) { netif_tx_stop_all_queues(dev); return 0; } static int ifb_open(struct net_device *dev) { netif_tx_start_all_queues(dev); return 0; } static int ifb_validate(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { if (tb[IFLA_ADDRESS]) { if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) return -EINVAL; if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) return -EADDRNOTAVAIL; } return 0; } static struct rtnl_link_ops ifb_link_ops __read_mostly = { .kind = "ifb", .priv_size = sizeof(struct ifb_dev_private), .setup = ifb_setup, .validate = ifb_validate, }; /* Number of ifb devices to be set up by this module. * Note that these legacy devices have one queue. * Prefer something like : ip link add ifb10 numtxqueues 8 type ifb */ static int numifbs = 2; module_param(numifbs, int, 0); MODULE_PARM_DESC(numifbs, "Number of ifb devices"); static int __init ifb_init_one(int index) { struct net_device *dev_ifb; int err; dev_ifb = alloc_netdev(sizeof(struct ifb_dev_private), "ifb%d", NET_NAME_UNKNOWN, ifb_setup); if (!dev_ifb) return -ENOMEM; dev_ifb->rtnl_link_ops = &ifb_link_ops; err = register_netdevice(dev_ifb); if (err < 0) goto err; return 0; err: free_netdev(dev_ifb); return err; } static int __init ifb_init_module(void) { int i, err; err = rtnl_link_register(&ifb_link_ops); if (err < 0) return err; rtnl_net_lock(&init_net); for (i = 0; i < numifbs && !err; i++) { err = ifb_init_one(i); cond_resched(); } rtnl_net_unlock(&init_net); if (err) rtnl_link_unregister(&ifb_link_ops); return err; } static void __exit ifb_cleanup_module(void) { rtnl_link_unregister(&ifb_link_ops); } module_init(ifb_init_module); module_exit(ifb_cleanup_module); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Intermediate Functional Block (ifb) netdevice driver for sharing of resources and ingress packet queuing"); MODULE_AUTHOR("Jamal Hadi Salim"); MODULE_ALIAS_RTNL_LINK("ifb");
/* SPDX-License-Identifier: GPL-2.0-only */ /* * net busy poll support * Copyright(c) 2013 Intel Corporation. * * Author: Eliezer Tamir * * Contact Information: * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net> */ #ifndef _LINUX_NET_BUSY_POLL_H #define _LINUX_NET_BUSY_POLL_H #include <linux/netdevice.h> #include <linux/sched/clock.h> #include <linux/sched/signal.h> #include <net/ip.h> #include <net/xdp.h> /* 0 - Reserved to indicate value not set * 1..NR_CPUS - Reserved for sender_cpu * NR_CPUS+1..~0 - Region available for NAPI IDs */ #define MIN_NAPI_ID ((unsigned int)(NR_CPUS + 1)) static inline bool napi_id_valid(unsigned int napi_id) { return napi_id >= MIN_NAPI_ID; } #define BUSY_POLL_BUDGET 8 #ifdef CONFIG_NET_RX_BUSY_POLL struct napi_struct; extern unsigned int sysctl_net_busy_read __read_mostly; extern unsigned int sysctl_net_busy_poll __read_mostly; static inline bool net_busy_loop_on(void) { return READ_ONCE(sysctl_net_busy_poll); } static inline bool sk_can_busy_loop(const struct sock *sk) { return READ_ONCE(sk->sk_ll_usec) && !signal_pending(current); } bool sk_busy_loop_end(void *p, unsigned long start_time); void napi_busy_loop(unsigned int napi_id, bool (*loop_end)(void *, unsigned long), void *loop_end_arg, bool prefer_busy_poll, u16 budget); void napi_busy_loop_rcu(unsigned int napi_id, bool (*loop_end)(void *, unsigned long), void *loop_end_arg, bool prefer_busy_poll, u16 budget); void napi_suspend_irqs(unsigned int napi_id); void napi_resume_irqs(unsigned int napi_id); #else /* CONFIG_NET_RX_BUSY_POLL */ static inline unsigned long net_busy_loop_on(void) { return 0; } static inline bool sk_can_busy_loop(struct sock *sk) { return false; } #endif /* CONFIG_NET_RX_BUSY_POLL */ static inline unsigned long busy_loop_current_time(void) { #ifdef CONFIG_NET_RX_BUSY_POLL return (unsigned long)(ktime_get_ns() >> 10); #else return 0; #endif } /* in poll/select we use the global sysctl_net_ll_poll value */ static inline bool busy_loop_timeout(unsigned long start_time) { #ifdef CONFIG_NET_RX_BUSY_POLL unsigned long bp_usec = READ_ONCE(sysctl_net_busy_poll); if (bp_usec) { unsigned long end_time = start_time + bp_usec; unsigned long now = busy_loop_current_time(); return time_after(now, end_time); } #endif return true; } static inline bool sk_busy_loop_timeout(struct sock *sk, unsigned long start_time) { #ifdef CONFIG_NET_RX_BUSY_POLL unsigned long bp_usec = READ_ONCE(sk->sk_ll_usec); if (bp_usec) { unsigned long end_time = start_time + bp_usec; unsigned long now = busy_loop_current_time(); return time_after(now, end_time); } #endif return true; } static inline void sk_busy_loop(struct sock *sk, int nonblock) { #ifdef CONFIG_NET_RX_BUSY_POLL unsigned int napi_id = READ_ONCE(sk->sk_napi_id); if (napi_id_valid(napi_id)) napi_busy_loop(napi_id, nonblock ?
NULL : sk_busy_loop_end, sk, READ_ONCE(sk->sk_prefer_busy_poll), READ_ONCE(sk->sk_busy_poll_budget) ?: BUSY_POLL_BUDGET); #endif } /* used in the NIC receive handler to mark the skb */ static inline void __skb_mark_napi_id(struct sk_buff *skb, const struct gro_node *gro) { #ifdef CONFIG_NET_RX_BUSY_POLL /* If the skb was already marked with a valid NAPI ID, avoid overwriting * it. */ if (!napi_id_valid(skb->napi_id)) skb->napi_id = gro->cached_napi_id; #endif } static inline void skb_mark_napi_id(struct sk_buff *skb, const struct napi_struct *napi) { __skb_mark_napi_id(skb, &napi->gro); } /* used in the protocol handler to propagate the napi_id to the socket */ static inline void sk_mark_napi_id(struct sock *sk, const struct sk_buff *skb) { #ifdef CONFIG_NET_RX_BUSY_POLL if (unlikely(READ_ONCE(sk->sk_napi_id) != skb->napi_id)) WRITE_ONCE(sk->sk_napi_id, skb->napi_id); #endif sk_rx_queue_update(sk, skb); } /* Variant of sk_mark_napi_id() for passive flow setup, * as sk->sk_napi_id and sk->sk_rx_queue_mapping content * needs to be set. */ static inline void sk_mark_napi_id_set(struct sock *sk, const struct sk_buff *skb) { #ifdef CONFIG_NET_RX_BUSY_POLL WRITE_ONCE(sk->sk_napi_id, skb->napi_id); #endif sk_rx_queue_set(sk, skb); } static inline void __sk_mark_napi_id_once(struct sock *sk, unsigned int napi_id) { #ifdef CONFIG_NET_RX_BUSY_POLL if (!READ_ONCE(sk->sk_napi_id)) WRITE_ONCE(sk->sk_napi_id, napi_id); #endif } /* variant used for unconnected sockets */ static inline void sk_mark_napi_id_once(struct sock *sk, const struct sk_buff *skb) { #ifdef CONFIG_NET_RX_BUSY_POLL __sk_mark_napi_id_once(sk, skb->napi_id); #endif } #endif /* _LINUX_NET_BUSY_POLL_H */
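The per-socket knobs read by sk_busy_loop() above (sk_ll_usec, sk_prefer_busy_poll, sk_busy_poll_budget) are normally set from user space, with net.core.busy_read and net.core.busy_poll providing the global defaults behind sysctl_net_busy_read and sysctl_net_busy_poll. A rough user-space sketch, assuming a kernel built with CONFIG_NET_RX_BUSY_POLL; the fallback option values are taken from asm-generic/socket.h and some of these setsockopt calls may require CAP_NET_ADMIN depending on kernel version:

#include <stdio.h>
#include <sys/socket.h>
#include <netinet/in.h>

#ifndef SO_BUSY_POLL
#define SO_BUSY_POLL		46
#endif
#ifndef SO_PREFER_BUSY_POLL
#define SO_PREFER_BUSY_POLL	69
#endif
#ifndef SO_BUSY_POLL_BUDGET
#define SO_BUSY_POLL_BUDGET	70
#endif

int main(void)
{
	int fd = socket(AF_INET, SOCK_DGRAM, 0);
	int usec = 50;		/* -> sk_ll_usec: busy poll up to 50us per read */
	int prefer = 1;		/* -> sk_prefer_busy_poll */
	int budget = 16;	/* -> sk_busy_poll_budget (BUSY_POLL_BUDGET is the default) */

	if (fd < 0) {
		perror("socket");
		return 1;
	}
	if (setsockopt(fd, SOL_SOCKET, SO_BUSY_POLL, &usec, sizeof(usec)))
		perror("SO_BUSY_POLL");
	if (setsockopt(fd, SOL_SOCKET, SO_PREFER_BUSY_POLL, &prefer, sizeof(prefer)))
		perror("SO_PREFER_BUSY_POLL");
	if (setsockopt(fd, SOL_SOCKET, SO_BUSY_POLL_BUDGET, &budget, sizeof(budget)))
		perror("SO_BUSY_POLL_BUDGET");

	/* Blocking receives on fd may now busy poll the NAPI context the
	 * socket last received from (sk_napi_id), per sk_busy_loop() above.
	 */
	return 0;
}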
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_CPUSET_H #define _LINUX_CPUSET_H /* * cpuset interface * * Copyright (C) 2003 BULL SA * Copyright (C) 2004-2006 Silicon Graphics, Inc. * */ #include <linux/sched.h> #include <linux/sched/topology.h> #include <linux/sched/task.h> #include <linux/cpumask.h> #include <linux/nodemask.h> #include <linux/mm.h> #include <linux/mmu_context.h> #include <linux/jump_label.h> #ifdef CONFIG_CPUSETS /* * Static branch rewrites can happen in an arbitrary order for a given * key. In code paths where we need to loop with read_mems_allowed_begin() and * read_mems_allowed_retry() to get a consistent view of mems_allowed, we need * to ensure that begin() always gets rewritten before retry() in the * disabled -> enabled transition. If not, then if local irqs are disabled * around the loop, we can deadlock since retry() would always be * comparing the latest value of the mems_allowed seqcount against 0 as * begin() still would see cpusets_enabled() as false. The enabled -> disabled * transition should happen in reverse order for the same reasons (want to stop * looking at real value of mems_allowed.sequence in retry() first). */ extern struct static_key_false cpusets_pre_enable_key; extern struct static_key_false cpusets_enabled_key; extern struct static_key_false cpusets_insane_config_key; static inline bool cpusets_enabled(void) { return static_branch_unlikely(&cpusets_enabled_key); } static inline void cpuset_inc(void) { static_branch_inc_cpuslocked(&cpusets_pre_enable_key); static_branch_inc_cpuslocked(&cpusets_enabled_key); } static inline void cpuset_dec(void) { static_branch_dec_cpuslocked(&cpusets_enabled_key); static_branch_dec_cpuslocked(&cpusets_pre_enable_key); } /* * This will get enabled whenever a cpuset configuration is considered * unsupportable in general. E.g. movable only node which cannot satisfy * any non movable allocations (see update_nodemask). Page allocator * needs to make additional checks for those configurations and this * check is meant to guard those checks without any overhead for sane * configurations.
*/ static inline bool cpusets_insane_config(void) { return static_branch_unlikely(&cpusets_insane_config_key); } extern int cpuset_init(void); extern void cpuset_init_smp(void); extern void cpuset_force_rebuild(void); extern void cpuset_update_active_cpus(void); extern void inc_dl_tasks_cs(struct task_struct *task); extern void dec_dl_tasks_cs(struct task_struct *task); extern void cpuset_lock(void); extern void cpuset_unlock(void); extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask); extern bool cpuset_cpus_allowed_fallback(struct task_struct *p); extern bool cpuset_cpu_is_isolated(int cpu); extern nodemask_t cpuset_mems_allowed(struct task_struct *p); #define cpuset_current_mems_allowed (current->mems_allowed) void cpuset_init_current_mems_allowed(void); int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask); extern bool cpuset_current_node_allowed(int node, gfp_t gfp_mask); static inline bool __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) { return cpuset_current_node_allowed(zone_to_nid(z), gfp_mask); } static inline bool cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) { if (cpusets_enabled()) return __cpuset_zone_allowed(z, gfp_mask); return true; } extern int cpuset_mems_allowed_intersects(const struct task_struct *tsk1, const struct task_struct *tsk2); #ifdef CONFIG_CPUSETS_V1 #define cpuset_memory_pressure_bump() \ do { \ if (cpuset_memory_pressure_enabled) \ __cpuset_memory_pressure_bump(); \ } while (0) extern int cpuset_memory_pressure_enabled; extern void __cpuset_memory_pressure_bump(void); #else static inline void cpuset_memory_pressure_bump(void) { } #endif extern void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task); extern int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *tsk); extern int cpuset_mem_spread_node(void); static inline int cpuset_do_page_mem_spread(void) { return task_spread_page(current); } extern bool current_cpuset_is_being_rebound(void); extern void dl_rebuild_rd_accounting(void); extern void rebuild_sched_domains(void); extern void cpuset_print_current_mems_allowed(void); extern void cpuset_reset_sched_domains(void); /* * read_mems_allowed_begin is required when making decisions involving * mems_allowed such as during page allocation. mems_allowed can be updated in * parallel and depending on the new value an operation can fail potentially * causing process failure. A retry loop with read_mems_allowed_begin and * read_mems_allowed_retry prevents these artificial failures. */ static inline unsigned int read_mems_allowed_begin(void) { if (!static_branch_unlikely(&cpusets_pre_enable_key)) return 0; return read_seqcount_begin(&current->mems_allowed_seq); } /* * If this returns true, the operation that took place after * read_mems_allowed_begin may have failed artificially due to a concurrent * update of mems_allowed. It is up to the caller to retry the operation if * appropriate. 
*/ static inline bool read_mems_allowed_retry(unsigned int seq) { if (!static_branch_unlikely(&cpusets_enabled_key)) return false; return read_seqcount_retry(&current->mems_allowed_seq, seq); } static inline void set_mems_allowed(nodemask_t nodemask) { unsigned long flags; task_lock(current); local_irq_save(flags); write_seqcount_begin(&current->mems_allowed_seq); current->mems_allowed = nodemask; write_seqcount_end(&current->mems_allowed_seq); local_irq_restore(flags); task_unlock(current); } extern bool cpuset_node_allowed(struct cgroup *cgroup, int nid); #else /* !CONFIG_CPUSETS */ static inline bool cpusets_enabled(void) { return false; } static inline bool cpusets_insane_config(void) { return false; } static inline int cpuset_init(void) { return 0; } static inline void cpuset_init_smp(void) {} static inline void cpuset_force_rebuild(void) { } static inline void cpuset_update_active_cpus(void) { partition_sched_domains(1, NULL, NULL); } static inline void inc_dl_tasks_cs(struct task_struct *task) { } static inline void dec_dl_tasks_cs(struct task_struct *task) { } static inline void cpuset_lock(void) { } static inline void cpuset_unlock(void) { } static inline void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask) { cpumask_copy(mask, task_cpu_possible_mask(p)); } static inline bool cpuset_cpus_allowed_fallback(struct task_struct *p) { return false; } static inline bool cpuset_cpu_is_isolated(int cpu) { return false; } static inline nodemask_t cpuset_mems_allowed(struct task_struct *p) { return node_possible_map; } #define cpuset_current_mems_allowed (node_states[N_MEMORY]) static inline void cpuset_init_current_mems_allowed(void) {} static inline int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask) { return 1; } static inline bool __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) { return true; } static inline bool cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) { return true; } static inline int cpuset_mems_allowed_intersects(const struct task_struct *tsk1, const struct task_struct *tsk2) { return 1; } static inline void cpuset_memory_pressure_bump(void) {} static inline void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task) { } static inline int cpuset_mem_spread_node(void) { return 0; } static inline int cpuset_do_page_mem_spread(void) { return 0; } static inline bool current_cpuset_is_being_rebound(void) { return false; } static inline void dl_rebuild_rd_accounting(void) { } static inline void rebuild_sched_domains(void) { partition_sched_domains(1, NULL, NULL); } static inline void cpuset_reset_sched_domains(void) { partition_sched_domains(1, NULL, NULL); } static inline void cpuset_print_current_mems_allowed(void) { } static inline void set_mems_allowed(nodemask_t nodemask) { } static inline unsigned int read_mems_allowed_begin(void) { return 0; } static inline bool read_mems_allowed_retry(unsigned int seq) { return false; } static inline bool cpuset_node_allowed(struct cgroup *cgroup, int nid) { return true; } #endif /* !CONFIG_CPUSETS */ #endif /* _LINUX_CPUSET_H */
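As the comments above describe, read_mems_allowed_begin()/read_mems_allowed_retry() are meant to bracket allocation decisions that consult current->mems_allowed, retrying when a concurrent cpuset update invalidates the snapshot. A minimal sketch of that loop, with a hypothetical example_alloc_respecting_mems() helper standing in for the real page-allocator call sites:

#include <linux/cpuset.h>
#include <linux/gfp.h>

/* Hypothetical allocator wrapper retrying on concurrent cpuset rebinds. */
static struct page *example_alloc_respecting_mems(gfp_t gfp, unsigned int order)
{
	unsigned int cpuset_mems_cookie;
	struct page *page;

	do {
		cpuset_mems_cookie = read_mems_allowed_begin();
		/* A node choice here would consult cpuset_current_mems_allowed. */
		page = alloc_pages(gfp, order);
	} while (!page && read_mems_allowed_retry(cpuset_mems_cookie));

	return page;
}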
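The kthread helpers implemented in the next file are normally consumed through kthread_run() plus a kthread_should_stop() loop, as the kthread_create_on_node() kernel-doc below spells out. A minimal, hypothetical consumer (example_threadfn/example_start/example_stop are illustrative names, not part of the file) might look like:

#include <linux/kthread.h>
#include <linux/delay.h>
#include <linux/err.h>

static struct task_struct *example_task;

static int example_threadfn(void *data)
{
	/* Run until kthread_stop() is called on this task. */
	while (!kthread_should_stop()) {
		/* ... periodic work on @data would go here ... */
		msleep_interruptible(1000);
	}
	return 0;	/* passed back to the kthread_stop() caller */
}

static int example_start(void *data)
{
	/* kthread_run() = kthread_create() + wake_up_process(). */
	example_task = kthread_run(example_threadfn, data, "example-kthread");
	return PTR_ERR_OR_ZERO(example_task);
}

static void example_stop(void)
{
	if (!IS_ERR_OR_NULL(example_task))
		kthread_stop(example_task);
}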
// SPDX-License-Identifier: GPL-2.0-only /* Kernel thread helper functions. * Copyright (C) 2004 IBM Corporation, Rusty Russell. * Copyright (C) 2009 Red Hat, Inc. * * Creation is done via kthreadd, so that we get a clean environment * even if we're invoked from userspace (think modprobe, hotplug cpu, * etc.). */ #include <uapi/linux/sched/types.h> #include <linux/mm.h> #include <linux/mmu_context.h> #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/sched/task.h> #include <linux/kthread.h> #include <linux/completion.h> #include <linux/err.h> #include <linux/cgroup.h> #include <linux/cpuset.h> #include <linux/unistd.h> #include <linux/file.h> #include <linux/export.h> #include <linux/mutex.h> #include <linux/slab.h> #include <linux/freezer.h> #include <linux/ptrace.h> #include <linux/uaccess.h> #include <linux/numa.h> #include <linux/sched/isolation.h> #include <trace/events/sched.h> static DEFINE_SPINLOCK(kthread_create_lock); static LIST_HEAD(kthread_create_list); struct task_struct *kthreadd_task; static LIST_HEAD(kthreads_hotplug); static DEFINE_MUTEX(kthreads_hotplug_lock); struct kthread_create_info { /* Information passed to kthread() from kthreadd. */ char *full_name; int (*threadfn)(void *data); void *data; int node; /* Result passed back to kthread_create() from kthreadd. */ struct task_struct *result; struct completion *done; struct list_head list; }; struct kthread { unsigned long flags; unsigned int cpu; unsigned int node; int started; int result; int (*threadfn)(void *); void *data; struct completion parked; struct completion exited; #ifdef CONFIG_BLK_CGROUP struct cgroup_subsys_state *blkcg_css; #endif /* To store the full name if task comm is truncated. */ char *full_name; struct task_struct *task; struct list_head hotplug_node; struct cpumask *preferred_affinity; }; enum KTHREAD_BITS { KTHREAD_IS_PER_CPU = 0, KTHREAD_SHOULD_STOP, KTHREAD_SHOULD_PARK, }; static inline struct kthread *to_kthread(struct task_struct *k) { WARN_ON(!(k->flags & PF_KTHREAD)); return k->worker_private; } /* * Variant of to_kthread() that doesn't assume @p is a kthread. * * When "(p->flags & PF_KTHREAD)" is set the task is a kthread and will * always remain a kthread. For kthreads p->worker_private always * points to a struct kthread. For tasks that are not kthreads * p->worker_private is used to point to other things. * * Return NULL for any task that is not a kthread.
*/ static inline struct kthread *__to_kthread(struct task_struct *p) { void *kthread = p->worker_private; if (kthread && !(p->flags & PF_KTHREAD)) kthread = NULL; return kthread; } void get_kthread_comm(char *buf, size_t buf_size, struct task_struct *tsk) { struct kthread *kthread = to_kthread(tsk); if (!kthread || !kthread->full_name) { strscpy(buf, tsk->comm, buf_size); return; } strscpy_pad(buf, kthread->full_name, buf_size); } bool set_kthread_struct(struct task_struct *p) { struct kthread *kthread; if (WARN_ON_ONCE(to_kthread(p))) return false; kthread = kzalloc(sizeof(*kthread), GFP_KERNEL); if (!kthread) return false; init_completion(&kthread->exited); init_completion(&kthread->parked); INIT_LIST_HEAD(&kthread->hotplug_node); p->vfork_done = &kthread->exited; kthread->task = p; kthread->node = tsk_fork_get_node(current); p->worker_private = kthread; return true; } void free_kthread_struct(struct task_struct *k) { struct kthread *kthread; /* * Can be NULL if kmalloc() in set_kthread_struct() failed. */ kthread = to_kthread(k); if (!kthread) return; #ifdef CONFIG_BLK_CGROUP WARN_ON_ONCE(kthread->blkcg_css); #endif k->worker_private = NULL; kfree(kthread->full_name); kfree(kthread); } /** * kthread_should_stop - should this kthread return now? * * When someone calls kthread_stop() on your kthread, it will be woken * and this will return true. You should then return, and your return * value will be passed through to kthread_stop(). */ bool kthread_should_stop(void) { return test_bit(KTHREAD_SHOULD_STOP, &to_kthread(current)->flags); } EXPORT_SYMBOL(kthread_should_stop); static bool __kthread_should_park(struct task_struct *k) { return test_bit(KTHREAD_SHOULD_PARK, &to_kthread(k)->flags); } /** * kthread_should_park - should this kthread park now? * * When someone calls kthread_park() on your kthread, it will be woken * and this will return true. You should then do the necessary * cleanup and call kthread_parkme() * * Similar to kthread_should_stop(), but this keeps the thread alive * and in a park position. kthread_unpark() "restarts" the thread and * calls the thread function again. */ bool kthread_should_park(void) { return __kthread_should_park(current); } EXPORT_SYMBOL_GPL(kthread_should_park); bool kthread_should_stop_or_park(void) { struct kthread *kthread = __to_kthread(current); if (!kthread) return false; return kthread->flags & (BIT(KTHREAD_SHOULD_STOP) | BIT(KTHREAD_SHOULD_PARK)); } /** * kthread_freezable_should_stop - should this freezable kthread return now? * @was_frozen: optional out parameter, indicates whether %current was frozen * * kthread_should_stop() for freezable kthreads, which will enter * refrigerator if necessary. This function is safe from kthread_stop() / * freezer deadlock and freezable kthreads should use this function instead * of calling try_to_freeze() directly. */ bool kthread_freezable_should_stop(bool *was_frozen) { bool frozen = false; might_sleep(); if (unlikely(freezing(current))) frozen = __refrigerator(true); if (was_frozen) *was_frozen = frozen; return kthread_should_stop(); } EXPORT_SYMBOL_GPL(kthread_freezable_should_stop); /** * kthread_func - return the function specified on kthread creation * @task: kthread task in question * * Returns NULL if the task is not a kthread. 
*/ void *kthread_func(struct task_struct *task) { struct kthread *kthread = __to_kthread(task); if (kthread) return kthread->threadfn; return NULL; } EXPORT_SYMBOL_GPL(kthread_func); /** * kthread_data - return data value specified on kthread creation * @task: kthread task in question * * Return the data value specified when kthread @task was created. * The caller is responsible for ensuring the validity of @task when * calling this function. */ void *kthread_data(struct task_struct *task) { return to_kthread(task)->data; } EXPORT_SYMBOL_GPL(kthread_data); /** * kthread_probe_data - speculative version of kthread_data() * @task: possible kthread task in question * * @task could be a kthread task. Return the data value specified when it * was created if accessible. If @task isn't a kthread task or its data is * inaccessible for any reason, %NULL is returned. This function requires * that @task itself is safe to dereference. */ void *kthread_probe_data(struct task_struct *task) { struct kthread *kthread = __to_kthread(task); void *data = NULL; if (kthread) copy_from_kernel_nofault(&data, &kthread->data, sizeof(data)); return data; } static void __kthread_parkme(struct kthread *self) { for (;;) { /* * TASK_PARKED is a special state; we must serialize against * possible pending wakeups to avoid store-store collisions on * task->state. * * Such a collision might possibly result in the task state * changin from TASK_PARKED and us failing the * wait_task_inactive() in kthread_park(). */ set_special_state(TASK_PARKED); if (!test_bit(KTHREAD_SHOULD_PARK, &self->flags)) break; /* * Thread is going to call schedule(), do not preempt it, * or the caller of kthread_park() may spend more time in * wait_task_inactive(). */ preempt_disable(); complete(&self->parked); schedule_preempt_disabled(); preempt_enable(); } __set_current_state(TASK_RUNNING); } void kthread_parkme(void) { __kthread_parkme(to_kthread(current)); } EXPORT_SYMBOL_GPL(kthread_parkme); /** * kthread_exit - Cause the current kthread return @result to kthread_stop(). * @result: The integer value to return to kthread_stop(). * * While kthread_exit can be called directly, it exists so that * functions which do some additional work in non-modular code such as * module_put_and_kthread_exit can be implemented. * * Does not return. */ void __noreturn kthread_exit(long result) { struct kthread *kthread = to_kthread(current); kthread->result = result; if (!list_empty(&kthread->hotplug_node)) { mutex_lock(&kthreads_hotplug_lock); list_del(&kthread->hotplug_node); mutex_unlock(&kthreads_hotplug_lock); if (kthread->preferred_affinity) { kfree(kthread->preferred_affinity); kthread->preferred_affinity = NULL; } } do_exit(0); } EXPORT_SYMBOL(kthread_exit); /** * kthread_complete_and_exit - Exit the current kthread. * @comp: Completion to complete * @code: The integer value to return to kthread_stop(). * * If present, complete @comp and then return code to kthread_stop(). * * A kernel thread whose module may be removed after the completion of * @comp can use this function to exit safely. * * Does not return. 
*/ void __noreturn kthread_complete_and_exit(struct completion *comp, long code) { if (comp) complete(comp); kthread_exit(code); } EXPORT_SYMBOL(kthread_complete_and_exit); static void kthread_fetch_affinity(struct kthread *kthread, struct cpumask *cpumask) { const struct cpumask *pref; if (kthread->preferred_affinity) { pref = kthread->preferred_affinity; } else { if (WARN_ON_ONCE(kthread->node == NUMA_NO_NODE)) return; pref = cpumask_of_node(kthread->node); } cpumask_and(cpumask, pref, housekeeping_cpumask(HK_TYPE_KTHREAD)); if (cpumask_empty(cpumask)) cpumask_copy(cpumask, housekeeping_cpumask(HK_TYPE_KTHREAD)); } static void kthread_affine_node(void) { struct kthread *kthread = to_kthread(current); cpumask_var_t affinity; WARN_ON_ONCE(kthread_is_per_cpu(current)); if (kthread->node == NUMA_NO_NODE) { housekeeping_affine(current, HK_TYPE_KTHREAD); } else { if (!zalloc_cpumask_var(&affinity, GFP_KERNEL)) { WARN_ON_ONCE(1); return; } mutex_lock(&kthreads_hotplug_lock); WARN_ON_ONCE(!list_empty(&kthread->hotplug_node)); list_add_tail(&kthread->hotplug_node, &kthreads_hotplug); /* * The node cpumask is racy when read from kthread() but: * - a racing CPU going down will either fail on the subsequent * call to set_cpus_allowed_ptr() or be migrated to housekeepers * afterwards by the scheduler. * - a racing CPU going up will be handled by kthreads_online_cpu() */ kthread_fetch_affinity(kthread, affinity); set_cpus_allowed_ptr(current, affinity); mutex_unlock(&kthreads_hotplug_lock); free_cpumask_var(affinity); } } static int kthread(void *_create) { static const struct sched_param param = { .sched_priority = 0 }; /* Copy data: it's on kthread's stack */ struct kthread_create_info *create = _create; int (*threadfn)(void *data) = create->threadfn; void *data = create->data; struct completion *done; struct kthread *self; int ret; self = to_kthread(current); /* Release the structure when caller killed by a fatal signal. */ done = xchg(&create->done, NULL); if (!done) { kfree(create->full_name); kfree(create); kthread_exit(-EINTR); } self->full_name = create->full_name; self->threadfn = threadfn; self->data = data; /* * The new thread inherited kthreadd's priority and CPU mask. Reset * back to default in case they have been changed. */ sched_setscheduler_nocheck(current, SCHED_NORMAL, &param); /* OK, tell user we're spawned, wait for stop or wakeup */ __set_current_state(TASK_UNINTERRUPTIBLE); create->result = current; /* * Thread is going to call schedule(), do not preempt it, * or the creator may spend more time in wait_task_inactive(). */ preempt_disable(); complete(done); schedule_preempt_disabled(); preempt_enable(); self->started = 1; if (!(current->flags & PF_NO_SETAFFINITY) && !self->preferred_affinity) kthread_affine_node(); ret = -EINTR; if (!test_bit(KTHREAD_SHOULD_STOP, &self->flags)) { cgroup_kthread_ready(); __kthread_parkme(self); ret = threadfn(data); } kthread_exit(ret); } /* called from kernel_clone() to get node information for about to be created task */ int tsk_fork_get_node(struct task_struct *tsk) { #ifdef CONFIG_NUMA if (tsk == kthreadd_task) return tsk->pref_node_fork; #endif return NUMA_NO_NODE; } static void create_kthread(struct kthread_create_info *create) { int pid; #ifdef CONFIG_NUMA current->pref_node_fork = create->node; #endif /* We want our own signal handler (we take no signals by default). */ pid = kernel_thread(kthread, create, create->full_name, CLONE_FS | CLONE_FILES | SIGCHLD); if (pid < 0) { /* Release the structure when caller killed by a fatal signal. 
*/ struct completion *done = xchg(&create->done, NULL); kfree(create->full_name); if (!done) { kfree(create); return; } create->result = ERR_PTR(pid); complete(done); } } static __printf(4, 0) struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data), void *data, int node, const char namefmt[], va_list args) { DECLARE_COMPLETION_ONSTACK(done); struct task_struct *task; struct kthread_create_info *create = kmalloc(sizeof(*create), GFP_KERNEL); if (!create) return ERR_PTR(-ENOMEM); create->threadfn = threadfn; create->data = data; create->node = node; create->done = &done; create->full_name = kvasprintf(GFP_KERNEL, namefmt, args); if (!create->full_name) { task = ERR_PTR(-ENOMEM); goto free_create; } spin_lock(&kthread_create_lock); list_add_tail(&create->list, &kthread_create_list); spin_unlock(&kthread_create_lock); wake_up_process(kthreadd_task); /* * Wait for completion in killable state, for I might be chosen by * the OOM killer while kthreadd is trying to allocate memory for * new kernel thread. */ if (unlikely(wait_for_completion_killable(&done))) { /* * If I was killed by a fatal signal before kthreadd (or new * kernel thread) calls complete(), leave the cleanup of this * structure to that thread. */ if (xchg(&create->done, NULL)) return ERR_PTR(-EINTR); /* * kthreadd (or new kernel thread) will call complete() * shortly. */ wait_for_completion(&done); } task = create->result; free_create: kfree(create); return task; } /** * kthread_create_on_node - create a kthread. * @threadfn: the function to run until signal_pending(current). * @data: data ptr for @threadfn. * @node: task and thread structures for the thread are allocated on this node * @namefmt: printf-style name for the thread. * * Description: This helper function creates and names a kernel * thread. The thread will be stopped: use wake_up_process() to start * it. See also kthread_run(). The new thread has SCHED_NORMAL policy and * is affine to all CPUs. * * If thread is going to be bound on a particular cpu, give its node * in @node, to get NUMA affinity for kthread stack, or else give NUMA_NO_NODE. * When woken, the thread will run @threadfn() with @data as its * argument. @threadfn() can either return directly if it is a * standalone thread for which no one will call kthread_stop(), or * return when 'kthread_should_stop()' is true (which means * kthread_stop() has been called). The return value should be zero * or a negative error number; it will be passed to kthread_stop(). * * Returns a task_struct or ERR_PTR(-ENOMEM) or ERR_PTR(-EINTR). */ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), void *data, int node, const char namefmt[], ...) { struct task_struct *task; va_list args; va_start(args, namefmt); task = __kthread_create_on_node(threadfn, data, node, namefmt, args); va_end(args); return task; } EXPORT_SYMBOL(kthread_create_on_node); static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask, unsigned int state) { unsigned long flags; if (!wait_task_inactive(p, state)) { WARN_ON(1); return; } /* It's safe because the task is inactive. 
*/ raw_spin_lock_irqsave(&p->pi_lock, flags); do_set_cpus_allowed(p, mask); p->flags |= PF_NO_SETAFFINITY; raw_spin_unlock_irqrestore(&p->pi_lock, flags); } static void __kthread_bind(struct task_struct *p, unsigned int cpu, unsigned int state) { __kthread_bind_mask(p, cpumask_of(cpu), state); } void kthread_bind_mask(struct task_struct *p, const struct cpumask *mask) { struct kthread *kthread = to_kthread(p); __kthread_bind_mask(p, mask, TASK_UNINTERRUPTIBLE); WARN_ON_ONCE(kthread->started); } /** * kthread_bind - bind a just-created kthread to a cpu. * @p: thread created by kthread_create(). * @cpu: cpu (might not be online, must be possible) for @k to run on. * * Description: This function is equivalent to set_cpus_allowed(), * except that @cpu doesn't need to be online, and the thread must be * stopped (i.e., just returned from kthread_create()). */ void kthread_bind(struct task_struct *p, unsigned int cpu) { struct kthread *kthread = to_kthread(p); __kthread_bind(p, cpu, TASK_UNINTERRUPTIBLE); WARN_ON_ONCE(kthread->started); } EXPORT_SYMBOL(kthread_bind); /** * kthread_create_on_cpu - Create a cpu bound kthread * @threadfn: the function to run until signal_pending(current). * @data: data ptr for @threadfn. * @cpu: The cpu on which the thread should be bound, * @namefmt: printf-style name for the thread. Format is restricted * to "name.*%u". Code fills in cpu number. * * Description: This helper function creates and names a kernel thread */ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), void *data, unsigned int cpu, const char *namefmt) { struct task_struct *p; p = kthread_create_on_node(threadfn, data, cpu_to_node(cpu), namefmt, cpu); if (IS_ERR(p)) return p; kthread_bind(p, cpu); /* CPU hotplug need to bind once again when unparking the thread. */ to_kthread(p)->cpu = cpu; return p; } EXPORT_SYMBOL(kthread_create_on_cpu); void kthread_set_per_cpu(struct task_struct *k, int cpu) { struct kthread *kthread = to_kthread(k); if (!kthread) return; WARN_ON_ONCE(!(k->flags & PF_NO_SETAFFINITY)); if (cpu < 0) { clear_bit(KTHREAD_IS_PER_CPU, &kthread->flags); return; } kthread->cpu = cpu; set_bit(KTHREAD_IS_PER_CPU, &kthread->flags); } bool kthread_is_per_cpu(struct task_struct *p) { struct kthread *kthread = __to_kthread(p); if (!kthread) return false; return test_bit(KTHREAD_IS_PER_CPU, &kthread->flags); } /** * kthread_unpark - unpark a thread created by kthread_create(). * @k: thread created by kthread_create(). * * Sets kthread_should_park() for @k to return false, wakes it, and * waits for it to return. If the thread is marked percpu then its * bound to the cpu again. */ void kthread_unpark(struct task_struct *k) { struct kthread *kthread = to_kthread(k); if (!test_bit(KTHREAD_SHOULD_PARK, &kthread->flags)) return; /* * Newly created kthread was parked when the CPU was offline. * The binding was lost and we need to set it again. */ if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags)) __kthread_bind(k, kthread->cpu, TASK_PARKED); clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); /* * __kthread_parkme() will either see !SHOULD_PARK or get the wakeup. */ wake_up_state(k, TASK_PARKED); } EXPORT_SYMBOL_GPL(kthread_unpark); /** * kthread_park - park a thread created by kthread_create(). * @k: thread created by kthread_create(). * * Sets kthread_should_park() for @k to return true, wakes it, and * waits for it to return. This can also be called after kthread_create() * instead of calling wake_up_process(): the thread will park without * calling threadfn(). 
* * Returns 0 if the thread is parked, -ENOSYS if the thread exited. * If called by the kthread itself just the park bit is set. */ int kthread_park(struct task_struct *k) { struct kthread *kthread = to_kthread(k); if (WARN_ON(k->flags & PF_EXITING)) return -ENOSYS; if (WARN_ON_ONCE(test_bit(KTHREAD_SHOULD_PARK, &kthread->flags))) return -EBUSY; set_bit(KTHREAD_SHOULD_PARK, &kthread->flags); if (k != current) { wake_up_process(k); /* * Wait for __kthread_parkme() to complete(), this means we * _will_ have TASK_PARKED and are about to call schedule(). */ wait_for_completion(&kthread->parked); /* * Now wait for that schedule() to complete and the task to * get scheduled out. */ WARN_ON_ONCE(!wait_task_inactive(k, TASK_PARKED)); } return 0; } EXPORT_SYMBOL_GPL(kthread_park); /** * kthread_stop - stop a thread created by kthread_create(). * @k: thread created by kthread_create(). * * Sets kthread_should_stop() for @k to return true, wakes it, and * waits for it to exit. This can also be called after kthread_create() * instead of calling wake_up_process(): the thread will exit without * calling threadfn(). * * If threadfn() may call kthread_exit() itself, the caller must ensure * task_struct can't go away. * * Returns the result of threadfn(), or %-EINTR if wake_up_process() * was never called. */ int kthread_stop(struct task_struct *k) { struct kthread *kthread; int ret; trace_sched_kthread_stop(k); get_task_struct(k); kthread = to_kthread(k); set_bit(KTHREAD_SHOULD_STOP, &kthread->flags); kthread_unpark(k); set_tsk_thread_flag(k, TIF_NOTIFY_SIGNAL); wake_up_process(k); wait_for_completion(&kthread->exited); ret = kthread->result; put_task_struct(k); trace_sched_kthread_stop_ret(ret); return ret; } EXPORT_SYMBOL(kthread_stop); /** * kthread_stop_put - stop a thread and put its task struct * @k: thread created by kthread_create(). * * Stops a thread created by kthread_create() and put its task_struct. * Only use when holding an extra task struct reference obtained by * calling get_task_struct(). */ int kthread_stop_put(struct task_struct *k) { int ret; ret = kthread_stop(k); put_task_struct(k); return ret; } EXPORT_SYMBOL(kthread_stop_put); int kthreadd(void *unused) { static const char comm[TASK_COMM_LEN] = "kthreadd"; struct task_struct *tsk = current; /* Setup a clean context for our children to inherit. 
*/ set_task_comm(tsk, comm); ignore_signals(tsk); set_cpus_allowed_ptr(tsk, housekeeping_cpumask(HK_TYPE_KTHREAD)); set_mems_allowed(node_states[N_MEMORY]); current->flags |= PF_NOFREEZE; cgroup_init_kthreadd(); for (;;) { set_current_state(TASK_INTERRUPTIBLE); if (list_empty(&kthread_create_list)) schedule(); __set_current_state(TASK_RUNNING); spin_lock(&kthread_create_lock); while (!list_empty(&kthread_create_list)) { struct kthread_create_info *create; create = list_entry(kthread_create_list.next, struct kthread_create_info, list); list_del_init(&create->list); spin_unlock(&kthread_create_lock); create_kthread(create); spin_lock(&kthread_create_lock); } spin_unlock(&kthread_create_lock); } return 0; } int kthread_affine_preferred(struct task_struct *p, const struct cpumask *mask) { struct kthread *kthread = to_kthread(p); cpumask_var_t affinity; unsigned long flags; int ret = 0; if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE) || kthread->started) { WARN_ON(1); return -EINVAL; } WARN_ON_ONCE(kthread->preferred_affinity); if (!zalloc_cpumask_var(&affinity, GFP_KERNEL)) return -ENOMEM; kthread->preferred_affinity = kzalloc(sizeof(struct cpumask), GFP_KERNEL); if (!kthread->preferred_affinity) { ret = -ENOMEM; goto out; } mutex_lock(&kthreads_hotplug_lock); cpumask_copy(kthread->preferred_affinity, mask); WARN_ON_ONCE(!list_empty(&kthread->hotplug_node)); list_add_tail(&kthread->hotplug_node, &kthreads_hotplug); kthread_fetch_affinity(kthread, affinity); /* It's safe because the task is inactive. */ raw_spin_lock_irqsave(&p->pi_lock, flags); do_set_cpus_allowed(p, affinity); raw_spin_unlock_irqrestore(&p->pi_lock, flags); mutex_unlock(&kthreads_hotplug_lock); out: free_cpumask_var(affinity); return ret; } EXPORT_SYMBOL_GPL(kthread_affine_preferred); /* * Re-affine kthreads according to their preferences * and the newly online CPU. The CPU down part is handled * by select_fallback_rq() which default re-affines to * housekeepers from other nodes in case the preferred * affinity doesn't apply anymore. */ static int kthreads_online_cpu(unsigned int cpu) { cpumask_var_t affinity; struct kthread *k; int ret; guard(mutex)(&kthreads_hotplug_lock); if (list_empty(&kthreads_hotplug)) return 0; if (!zalloc_cpumask_var(&affinity, GFP_KERNEL)) return -ENOMEM; ret = 0; list_for_each_entry(k, &kthreads_hotplug, hotplug_node) { if (WARN_ON_ONCE((k->task->flags & PF_NO_SETAFFINITY) || kthread_is_per_cpu(k->task))) { ret = -EINVAL; continue; } kthread_fetch_affinity(k, affinity); set_cpus_allowed_ptr(k->task, affinity); } free_cpumask_var(affinity); return ret; } static int kthreads_init(void) { return cpuhp_setup_state(CPUHP_AP_KTHREADS_ONLINE, "kthreads:online", kthreads_online_cpu, NULL); } early_initcall(kthreads_init); void __kthread_init_worker(struct kthread_worker *worker, const char *name, struct lock_class_key *key) { memset(worker, 0, sizeof(struct kthread_worker)); raw_spin_lock_init(&worker->lock); lockdep_set_class_and_name(&worker->lock, key, name); INIT_LIST_HEAD(&worker->work_list); INIT_LIST_HEAD(&worker->delayed_work_list); } EXPORT_SYMBOL_GPL(__kthread_init_worker); /** * kthread_worker_fn - kthread function to process kthread_worker * @worker_ptr: pointer to initialized kthread_worker * * This function implements the main cycle of kthread worker. It processes * work_list until it is stopped with kthread_stop(). It sleeps when the queue * is empty. * * The works are not allowed to keep any locks, disable preemption or interrupts * when they finish. 
There is defined a safe point for freezing when one work * finishes and before a new one is started. * * Also the works must not be handled by more than one worker at the same time, * see also kthread_queue_work(). */ int kthread_worker_fn(void *worker_ptr) { struct kthread_worker *worker = worker_ptr; struct kthread_work *work; /* * FIXME: Update the check and remove the assignment when all kthread * worker users are created using kthread_create_worker*() functions. */ WARN_ON(worker->task && worker->task != current); worker->task = current; if (worker->flags & KTW_FREEZABLE) set_freezable(); repeat: set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */ if (kthread_should_stop()) { __set_current_state(TASK_RUNNING); raw_spin_lock_irq(&worker->lock); worker->task = NULL; raw_spin_unlock_irq(&worker->lock); return 0; } work = NULL; raw_spin_lock_irq(&worker->lock); if (!list_empty(&worker->work_list)) { work = list_first_entry(&worker->work_list, struct kthread_work, node); list_del_init(&work->node); } worker->current_work = work; raw_spin_unlock_irq(&worker->lock); if (work) { kthread_work_func_t func = work->func; __set_current_state(TASK_RUNNING); trace_sched_kthread_work_execute_start(work); work->func(work); /* * Avoid dereferencing work after this point. The trace * event only cares about the address. */ trace_sched_kthread_work_execute_end(work, func); } else if (!freezing(current)) { schedule(); } else { /* * Handle the case where the current remains * TASK_INTERRUPTIBLE. try_to_freeze() expects * the current to be TASK_RUNNING. */ __set_current_state(TASK_RUNNING); } try_to_freeze(); cond_resched(); goto repeat; } EXPORT_SYMBOL_GPL(kthread_worker_fn); static __printf(3, 0) struct kthread_worker * __kthread_create_worker_on_node(unsigned int flags, int node, const char namefmt[], va_list args) { struct kthread_worker *worker; struct task_struct *task; worker = kzalloc(sizeof(*worker), GFP_KERNEL); if (!worker) return ERR_PTR(-ENOMEM); kthread_init_worker(worker); task = __kthread_create_on_node(kthread_worker_fn, worker, node, namefmt, args); if (IS_ERR(task)) goto fail_task; worker->flags = flags; worker->task = task; return worker; fail_task: kfree(worker); return ERR_CAST(task); } /** * kthread_create_worker_on_node - create a kthread worker * @flags: flags modifying the default behavior of the worker * @node: task structure for the thread is allocated on this node * @namefmt: printf-style name for the kthread worker (task). * * Returns a pointer to the allocated worker on success, ERR_PTR(-ENOMEM) * when the needed structures could not get allocated, and ERR_PTR(-EINTR) * when the caller was killed by a fatal signal. */ struct kthread_worker * kthread_create_worker_on_node(unsigned int flags, int node, const char namefmt[], ...) { struct kthread_worker *worker; va_list args; va_start(args, namefmt); worker = __kthread_create_worker_on_node(flags, node, namefmt, args); va_end(args); return worker; } EXPORT_SYMBOL(kthread_create_worker_on_node); /** * kthread_create_worker_on_cpu - create a kthread worker and bind it * to a given CPU and the associated NUMA node. * @cpu: CPU number * @flags: flags modifying the default behavior of the worker * @namefmt: printf-style name for the thread. Format is restricted * to "name.*%u". Code fills in cpu number. * * Use a valid CPU number if you want to bind the kthread worker * to the given CPU and the associated NUMA node. * * A good practice is to add the cpu number also into the worker name. 
* For example, use kthread_create_worker_on_cpu(cpu, "helper/%d", cpu). * * CPU hotplug: * The kthread worker API is simple and generic. It just provides a way * to create, use, and destroy workers. * * It is up to the API user how to handle CPU hotplug. They have to decide * how to handle pending work items, prevent queuing new ones, and * restore the functionality when the CPU goes off and on. There are a * few catches: * * - CPU affinity gets lost when it is scheduled on an offline CPU. * * - The worker might not exist when the CPU was off when the user * created the workers. * * Good practice is to implement two CPU hotplug callbacks and to * destroy/create the worker when the CPU goes down/up. * * Return: * The pointer to the allocated worker on success, ERR_PTR(-ENOMEM) * when the needed structures could not get allocated, and ERR_PTR(-EINTR) * when the caller was killed by a fatal signal. */ struct kthread_worker * kthread_create_worker_on_cpu(int cpu, unsigned int flags, const char namefmt[]) { struct kthread_worker *worker; worker = kthread_create_worker_on_node(flags, cpu_to_node(cpu), namefmt, cpu); if (!IS_ERR(worker)) kthread_bind(worker->task, cpu); return worker; } EXPORT_SYMBOL(kthread_create_worker_on_cpu); /* * Returns true when the work could not be queued at the moment. * It happens when it is already pending in a worker list * or when it is being cancelled. */ static inline bool queuing_blocked(struct kthread_worker *worker, struct kthread_work *work) { lockdep_assert_held(&worker->lock); return !list_empty(&work->node) || work->canceling; } static void kthread_insert_work_sanity_check(struct kthread_worker *worker, struct kthread_work *work) { lockdep_assert_held(&worker->lock); WARN_ON_ONCE(!list_empty(&work->node)); /* Do not use a work with >1 worker, see kthread_queue_work() */ WARN_ON_ONCE(work->worker && work->worker != worker); } /* insert @work before @pos in @worker */ static void kthread_insert_work(struct kthread_worker *worker, struct kthread_work *work, struct list_head *pos) { kthread_insert_work_sanity_check(worker, work); trace_sched_kthread_work_queue_work(worker, work); list_add_tail(&work->node, pos); work->worker = worker; if (!worker->current_work && likely(worker->task)) wake_up_process(worker->task); } /** * kthread_queue_work - queue a kthread_work * @worker: target kthread_worker * @work: kthread_work to queue * * Queue @work to work processor @task for async execution. @task * must have been created with kthread_create_worker(). Returns %true * if @work was successfully queued, %false if it was already pending. * * Reinitialize the work if it needs to be used by another worker. * For example, when the worker was stopped and started again. */ bool kthread_queue_work(struct kthread_worker *worker, struct kthread_work *work) { bool ret = false; unsigned long flags; raw_spin_lock_irqsave(&worker->lock, flags); if (!queuing_blocked(worker, work)) { kthread_insert_work(worker, work, &worker->work_list); ret = true; } raw_spin_unlock_irqrestore(&worker->lock, flags); return ret; } EXPORT_SYMBOL_GPL(kthread_queue_work); /** * kthread_delayed_work_timer_fn - callback that queues the associated kthread * delayed work when the timer expires. * @t: pointer to the expired timer * * The format of the function is defined by struct timer_list. * It should have been called from irqsafe timer with irq already off. 
*/ void kthread_delayed_work_timer_fn(struct timer_list *t) { struct kthread_delayed_work *dwork = timer_container_of(dwork, t, timer); struct kthread_work *work = &dwork->work; struct kthread_worker *worker = work->worker; unsigned long flags; /* * This might happen when a pending work is reinitialized. * It means that it is used a wrong way. */ if (WARN_ON_ONCE(!worker)) return; raw_spin_lock_irqsave(&worker->lock, flags); /* Work must not be used with >1 worker, see kthread_queue_work(). */ WARN_ON_ONCE(work->worker != worker); /* Move the work from worker->delayed_work_list. */ WARN_ON_ONCE(list_empty(&work->node)); list_del_init(&work->node); if (!work->canceling) kthread_insert_work(worker, work, &worker->work_list); raw_spin_unlock_irqrestore(&worker->lock, flags); } EXPORT_SYMBOL(kthread_delayed_work_timer_fn); static void __kthread_queue_delayed_work(struct kthread_worker *worker, struct kthread_delayed_work *dwork, unsigned long delay) { struct timer_list *timer = &dwork->timer; struct kthread_work *work = &dwork->work; WARN_ON_ONCE(timer->function != kthread_delayed_work_timer_fn); /* * If @delay is 0, queue @dwork->work immediately. This is for * both optimization and correctness. The earliest @timer can * expire is on the closest next tick and delayed_work users depend * on that there's no such delay when @delay is 0. */ if (!delay) { kthread_insert_work(worker, work, &worker->work_list); return; } /* Be paranoid and try to detect possible races already now. */ kthread_insert_work_sanity_check(worker, work); list_add(&work->node, &worker->delayed_work_list); work->worker = worker; timer->expires = jiffies + delay; add_timer(timer); } /** * kthread_queue_delayed_work - queue the associated kthread work * after a delay. * @worker: target kthread_worker * @dwork: kthread_delayed_work to queue * @delay: number of jiffies to wait before queuing * * If the work has not been pending it starts a timer that will queue * the work after the given @delay. If @delay is zero, it queues the * work immediately. * * Return: %false if the @work has already been pending. It means that * either the timer was running or the work was queued. It returns %true * otherwise. */ bool kthread_queue_delayed_work(struct kthread_worker *worker, struct kthread_delayed_work *dwork, unsigned long delay) { struct kthread_work *work = &dwork->work; unsigned long flags; bool ret = false; raw_spin_lock_irqsave(&worker->lock, flags); if (!queuing_blocked(worker, work)) { __kthread_queue_delayed_work(worker, dwork, delay); ret = true; } raw_spin_unlock_irqrestore(&worker->lock, flags); return ret; } EXPORT_SYMBOL_GPL(kthread_queue_delayed_work); struct kthread_flush_work { struct kthread_work work; struct completion done; }; static void kthread_flush_work_fn(struct kthread_work *work) { struct kthread_flush_work *fwork = container_of(work, struct kthread_flush_work, work); complete(&fwork->done); } /** * kthread_flush_work - flush a kthread_work * @work: work to flush * * If @work is queued or executing, wait for it to finish execution. */ void kthread_flush_work(struct kthread_work *work) { struct kthread_flush_work fwork = { KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn), COMPLETION_INITIALIZER_ONSTACK(fwork.done), }; struct kthread_worker *worker; bool noop = false; worker = work->worker; if (!worker) return; raw_spin_lock_irq(&worker->lock); /* Work must not be used with >1 worker, see kthread_queue_work(). 
*/ WARN_ON_ONCE(work->worker != worker); if (!list_empty(&work->node)) kthread_insert_work(worker, &fwork.work, work->node.next); else if (worker->current_work == work) kthread_insert_work(worker, &fwork.work, worker->work_list.next); else noop = true; raw_spin_unlock_irq(&worker->lock); if (!noop) wait_for_completion(&fwork.done); } EXPORT_SYMBOL_GPL(kthread_flush_work); /* * Make sure that the timer is neither set nor running and could * not manipulate the work list_head any longer. * * The function is called under worker->lock. The lock is temporary * released but the timer can't be set again in the meantime. */ static void kthread_cancel_delayed_work_timer(struct kthread_work *work, unsigned long *flags) { struct kthread_delayed_work *dwork = container_of(work, struct kthread_delayed_work, work); struct kthread_worker *worker = work->worker; /* * timer_delete_sync() must be called to make sure that the timer * callback is not running. The lock must be temporary released * to avoid a deadlock with the callback. In the meantime, * any queuing is blocked by setting the canceling counter. */ work->canceling++; raw_spin_unlock_irqrestore(&worker->lock, *flags); timer_delete_sync(&dwork->timer); raw_spin_lock_irqsave(&worker->lock, *flags); work->canceling--; } /* * This function removes the work from the worker queue. * * It is called under worker->lock. The caller must make sure that * the timer used by delayed work is not running, e.g. by calling * kthread_cancel_delayed_work_timer(). * * The work might still be in use when this function finishes. See the * current_work proceed by the worker. * * Return: %true if @work was pending and successfully canceled, * %false if @work was not pending */ static bool __kthread_cancel_work(struct kthread_work *work) { /* * Try to remove the work from a worker list. It might either * be from worker->work_list or from worker->delayed_work_list. */ if (!list_empty(&work->node)) { list_del_init(&work->node); return true; } return false; } /** * kthread_mod_delayed_work - modify delay of or queue a kthread delayed work * @worker: kthread worker to use * @dwork: kthread delayed work to queue * @delay: number of jiffies to wait before queuing * * If @dwork is idle, equivalent to kthread_queue_delayed_work(). Otherwise, * modify @dwork's timer so that it expires after @delay. If @delay is zero, * @work is guaranteed to be queued immediately. * * Return: %false if @dwork was idle and queued, %true otherwise. * * A special case is when the work is being canceled in parallel. * It might be caused either by the real kthread_cancel_delayed_work_sync() * or yet another kthread_mod_delayed_work() call. We let the other command * win and return %true here. The return value can be used for reference * counting and the number of queued works stays the same. Anyway, the caller * is supposed to synchronize these operations a reasonable way. * * This function is safe to call from any context including IRQ handler. * See __kthread_cancel_work() and kthread_delayed_work_timer_fn() * for details. */ bool kthread_mod_delayed_work(struct kthread_worker *worker, struct kthread_delayed_work *dwork, unsigned long delay) { struct kthread_work *work = &dwork->work; unsigned long flags; int ret; raw_spin_lock_irqsave(&worker->lock, flags); /* Do not bother with canceling when never queued. 
*/ if (!work->worker) { ret = false; goto fast_queue; } /* Work must not be used with >1 worker, see kthread_queue_work() */ WARN_ON_ONCE(work->worker != worker); /* * Temporary cancel the work but do not fight with another command * that is canceling the work as well. * * It is a bit tricky because of possible races with another * mod_delayed_work() and cancel_delayed_work() callers. * * The timer must be canceled first because worker->lock is released * when doing so. But the work can be removed from the queue (list) * only when it can be queued again so that the return value can * be used for reference counting. */ kthread_cancel_delayed_work_timer(work, &flags); if (work->canceling) { /* The number of works in the queue does not change. */ ret = true; goto out; } ret = __kthread_cancel_work(work); fast_queue: __kthread_queue_delayed_work(worker, dwork, delay); out: raw_spin_unlock_irqrestore(&worker->lock, flags); return ret; } EXPORT_SYMBOL_GPL(kthread_mod_delayed_work); static bool __kthread_cancel_work_sync(struct kthread_work *work, bool is_dwork) { struct kthread_worker *worker = work->worker; unsigned long flags; int ret = false; if (!worker) goto out; raw_spin_lock_irqsave(&worker->lock, flags); /* Work must not be used with >1 worker, see kthread_queue_work(). */ WARN_ON_ONCE(work->worker != worker); if (is_dwork) kthread_cancel_delayed_work_timer(work, &flags); ret = __kthread_cancel_work(work); if (worker->current_work != work) goto out_fast; /* * The work is in progress and we need to wait with the lock released. * In the meantime, block any queuing by setting the canceling counter. */ work->canceling++; raw_spin_unlock_irqrestore(&worker->lock, flags); kthread_flush_work(work); raw_spin_lock_irqsave(&worker->lock, flags); work->canceling--; out_fast: raw_spin_unlock_irqrestore(&worker->lock, flags); out: return ret; } /** * kthread_cancel_work_sync - cancel a kthread work and wait for it to finish * @work: the kthread work to cancel * * Cancel @work and wait for its execution to finish. This function * can be used even if the work re-queues itself. On return from this * function, @work is guaranteed to be not pending or executing on any CPU. * * kthread_cancel_work_sync(&delayed_work->work) must not be used for * delayed_work's. Use kthread_cancel_delayed_work_sync() instead. * * The caller must ensure that the worker on which @work was last * queued can't be destroyed before this function returns. * * Return: %true if @work was pending, %false otherwise. */ bool kthread_cancel_work_sync(struct kthread_work *work) { return __kthread_cancel_work_sync(work, false); } EXPORT_SYMBOL_GPL(kthread_cancel_work_sync); /** * kthread_cancel_delayed_work_sync - cancel a kthread delayed work and * wait for it to finish. * @dwork: the kthread delayed work to cancel * * This is kthread_cancel_work_sync() for delayed works. * * Return: %true if @dwork was pending, %false otherwise. */ bool kthread_cancel_delayed_work_sync(struct kthread_delayed_work *dwork) { return __kthread_cancel_work_sync(&dwork->work, true); } EXPORT_SYMBOL_GPL(kthread_cancel_delayed_work_sync); /** * kthread_flush_worker - flush all current works on a kthread_worker * @worker: worker to flush * * Wait until all currently executing or pending works on @worker are * finished. 
*/ void kthread_flush_worker(struct kthread_worker *worker) { struct kthread_flush_work fwork = { KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn), COMPLETION_INITIALIZER_ONSTACK(fwork.done), }; kthread_queue_work(worker, &fwork.work); wait_for_completion(&fwork.done); } EXPORT_SYMBOL_GPL(kthread_flush_worker); /** * kthread_destroy_worker - destroy a kthread worker * @worker: worker to be destroyed * * Flush and destroy @worker. The simple flush is enough because the kthread * worker API is used only in trivial scenarios. There are no multi-step state * machines needed. * * Note that this function is not responsible for handling delayed work, so * caller should be responsible for queuing or canceling all delayed work items * before invoke this function. */ void kthread_destroy_worker(struct kthread_worker *worker) { struct task_struct *task; task = worker->task; if (WARN_ON(!task)) return; kthread_flush_worker(worker); kthread_stop(task); WARN_ON(!list_empty(&worker->delayed_work_list)); WARN_ON(!list_empty(&worker->work_list)); kfree(worker); } EXPORT_SYMBOL(kthread_destroy_worker); /** * kthread_use_mm - make the calling kthread operate on an address space * @mm: address space to operate on */ void kthread_use_mm(struct mm_struct *mm) { struct mm_struct *active_mm; struct task_struct *tsk = current; WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD)); WARN_ON_ONCE(tsk->mm); /* * It is possible for mm to be the same as tsk->active_mm, but * we must still mmgrab(mm) and mmdrop_lazy_tlb(active_mm), * because these references are not equivalent. */ mmgrab(mm); task_lock(tsk); /* Hold off tlb flush IPIs while switching mm's */ local_irq_disable(); active_mm = tsk->active_mm; tsk->active_mm = mm; tsk->mm = mm; membarrier_update_current_mm(mm); switch_mm_irqs_off(active_mm, mm, tsk); local_irq_enable(); task_unlock(tsk); #ifdef finish_arch_post_lock_switch finish_arch_post_lock_switch(); #endif /* * When a kthread starts operating on an address space, the loop * in membarrier_{private,global}_expedited() may not observe * that tsk->mm, and not issue an IPI. Membarrier requires a * memory barrier after storing to tsk->mm, before accessing * user-space memory. A full memory barrier for membarrier * {PRIVATE,GLOBAL}_EXPEDITED is implicitly provided by * mmdrop_lazy_tlb(). */ mmdrop_lazy_tlb(active_mm); } EXPORT_SYMBOL_GPL(kthread_use_mm); /** * kthread_unuse_mm - reverse the effect of kthread_use_mm() * @mm: address space to operate on */ void kthread_unuse_mm(struct mm_struct *mm) { struct task_struct *tsk = current; WARN_ON_ONCE(!(tsk->flags & PF_KTHREAD)); WARN_ON_ONCE(!tsk->mm); task_lock(tsk); /* * When a kthread stops operating on an address space, the loop * in membarrier_{private,global}_expedited() may not observe * that tsk->mm, and not issue an IPI. Membarrier requires a * memory barrier after accessing user-space memory, before * clearing tsk->mm. */ smp_mb__after_spinlock(); local_irq_disable(); tsk->mm = NULL; membarrier_update_current_mm(NULL); mmgrab_lazy_tlb(mm); /* active_mm is still 'mm' */ enter_lazy_tlb(mm, tsk); local_irq_enable(); task_unlock(tsk); mmdrop(mm); } EXPORT_SYMBOL_GPL(kthread_unuse_mm); #ifdef CONFIG_BLK_CGROUP /** * kthread_associate_blkcg - associate blkcg to current kthread * @css: the cgroup info * * Current thread must be a kthread. The thread is running jobs on behalf of * other threads. In some cases, we expect the jobs attach cgroup info of * original threads instead of that of current thread. 
This function stores * original thread's cgroup info in current kthread context for later * retrieval. */ void kthread_associate_blkcg(struct cgroup_subsys_state *css) { struct kthread *kthread; if (!(current->flags & PF_KTHREAD)) return; kthread = to_kthread(current); if (!kthread) return; if (kthread->blkcg_css) { css_put(kthread->blkcg_css); kthread->blkcg_css = NULL; } if (css) { css_get(css); kthread->blkcg_css = css; } } EXPORT_SYMBOL(kthread_associate_blkcg); /** * kthread_blkcg - get associated blkcg css of current kthread * * Current thread must be a kthread. */ struct cgroup_subsys_state *kthread_blkcg(void) { struct kthread *kthread; if (current->flags & PF_KTHREAD) { kthread = to_kthread(current); if (kthread) return kthread->blkcg_css; } return NULL; } #endif
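/*
 * Illustrative usage sketch (not part of kthread.c): how a driver might
 * drive the kthread worker API documented above -- create a worker, queue
 * regular and delayed work, and tear everything down in the right order.
 * The names demo_device, demo_work_fn, demo_poll_fn, demo_start and
 * demo_stop are hypothetical; error handling is trimmed to a minimum.
 */
#include <linux/kthread.h>
#include <linux/jiffies.h>
#include <linux/err.h>

struct demo_device {
	struct kthread_worker *worker;
	struct kthread_work irq_work;		/* queued from an event path */
	struct kthread_delayed_work poll_work;	/* periodic polling */
};

static void demo_work_fn(struct kthread_work *work)
{
	struct demo_device *dev = container_of(work, struct demo_device,
					       irq_work);
	/* process the event in sleepable kthread context */
	(void)dev;
}

static void demo_poll_fn(struct kthread_work *work)
{
	struct demo_device *dev = container_of(work, struct demo_device,
					       poll_work.work);

	/* ... poll the hardware ..., then re-arm in 100 ms */
	kthread_queue_delayed_work(dev->worker, &dev->poll_work,
				   msecs_to_jiffies(100));
}

static int demo_start(struct demo_device *dev)
{
	dev->worker = kthread_create_worker(0, "demo-worker");
	if (IS_ERR(dev->worker))
		return PTR_ERR(dev->worker);

	kthread_init_work(&dev->irq_work, demo_work_fn);
	kthread_init_delayed_work(&dev->poll_work, demo_poll_fn);

	/* a false return only means the work was already pending */
	kthread_queue_work(dev->worker, &dev->irq_work);
	kthread_queue_delayed_work(dev->worker, &dev->poll_work,
				   msecs_to_jiffies(100));
	return 0;
}

static void demo_stop(struct demo_device *dev)
{
	/* stop the self-re-arming delayed work first, then drain and destroy */
	kthread_cancel_delayed_work_sync(&dev->poll_work);
	kthread_flush_worker(dev->worker);
	kthread_destroy_worker(dev->worker);
}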
/* +++ deflate.c */
/* deflate.c -- compress data using the deflation algorithm
 * Copyright (C) 1995-1996 Jean-loup Gailly.
 * For conditions of distribution and use, see copyright notice in zlib.h
 */

/*
 *  ALGORITHM
 *
 *      The "deflation" process depends on being able to identify portions
 *      of the input text which are identical to earlier input (within a
 *      sliding window trailing behind the input currently being processed).
 *
 *      The most straightforward technique turns out to be the fastest for
 *      most input files: try all possible matches and select the longest.
 *      The key feature of this algorithm is that insertions into the string
 *      dictionary are very simple and thus fast, and deletions are avoided
 *      completely. Insertions are performed at each input character, whereas
 *      string matches are performed only when the previous match ends. So it
 *      is preferable to spend more time in matches to allow very fast string
 *      insertions and avoid deletions. The matching algorithm for small
 *      strings is inspired from that of Rabin & Karp. A brute force approach
 *      is used to find longer strings when a small match has been found.
 *      A similar algorithm is used in comic (by Jan-Mark Wams) and freeze
 *      (by Leonid Broukhis).
 *      A previous version of this file used a more sophisticated algorithm
 *      (by Fiala and Greene) which is guaranteed to run in linear amortized
 *      time, but has a larger average cost, uses more memory and is patented.
 *      However the F&G algorithm may be faster for some highly redundant
 *      files if the parameter max_chain_length (described below) is too large.
 *
 *  ACKNOWLEDGEMENTS
 *
 *      The idea of lazy evaluation of matches is due to Jan-Mark Wams, and
 *      I found it in 'freeze' written by Leonid Broukhis.
 *      Thanks to many people for bug reports and testing.
 *
 *  REFERENCES
 *
 *      Deutsch, L.P.,"DEFLATE Compressed Data Format Specification".
 *      Available in ftp://ds.internic.net/rfc/rfc1951.txt
 *
 *      A description of the Rabin and Karp algorithm is given in the book
 *      "Algorithms" by R. Sedgewick, Addison-Wesley, p252.
 *
 *      Fiala,E.R., and Greene,D.H.
* Data Compression with Finite Windows, Comm.ACM, 32,4 (1989) 490-595 * */ #include <linux/module.h> #include <linux/zutil.h> #include "defutil.h" /* architecture-specific bits */ #ifdef CONFIG_ZLIB_DFLTCC # include "../zlib_dfltcc/dfltcc_deflate.h" #else #define DEFLATE_RESET_HOOK(strm) do {} while (0) #define DEFLATE_HOOK(strm, flush, bstate) 0 #define DEFLATE_NEED_CHECKSUM(strm) 1 #define DEFLATE_DFLTCC_ENABLED() 0 #endif /* =========================================================================== * Function prototypes. */ typedef block_state (*compress_func) (deflate_state *s, int flush); /* Compression function. Returns the block state after the call. */ static void fill_window (deflate_state *s); static block_state deflate_stored (deflate_state *s, int flush); static block_state deflate_fast (deflate_state *s, int flush); static block_state deflate_slow (deflate_state *s, int flush); static void lm_init (deflate_state *s); static void putShortMSB (deflate_state *s, uInt b); static int read_buf (z_streamp strm, Byte *buf, unsigned size); static uInt longest_match (deflate_state *s, IPos cur_match); #ifdef DEBUG_ZLIB static void check_match (deflate_state *s, IPos start, IPos match, int length); #endif /* =========================================================================== * Local data */ #define NIL 0 /* Tail of hash chains */ #ifndef TOO_FAR # define TOO_FAR 4096 #endif /* Matches of length 3 are discarded if their distance exceeds TOO_FAR */ #define MIN_LOOKAHEAD (MAX_MATCH+MIN_MATCH+1) /* Minimum amount of lookahead, except at the end of the input file. * See deflate.c for comments about the MIN_MATCH+1. */ /* Workspace to be allocated for deflate processing */ typedef struct deflate_workspace { /* State memory for the deflator */ deflate_state deflate_memory; #ifdef CONFIG_ZLIB_DFLTCC /* State memory for s390 hardware deflate */ struct dfltcc_deflate_state dfltcc_memory; #endif Byte *window_memory; Pos *prev_memory; Pos *head_memory; char *overlay_memory; } deflate_workspace; #ifdef CONFIG_ZLIB_DFLTCC /* dfltcc_state must be doubleword aligned for DFLTCC call */ static_assert(offsetof(struct deflate_workspace, dfltcc_memory) % 8 == 0); #endif /* Values for max_lazy_match, good_match and max_chain_length, depending on * the desired pack level (0..9). The values given below have been tuned to * exclude worst case performance for pathological files. Better values may be * found for specific files. */ typedef struct config_s { ush good_length; /* reduce lazy search above this match length */ ush max_lazy; /* do not perform lazy search above this match length */ ush nice_length; /* quit search above this match length */ ush max_chain; compress_func func; } config; static const config configuration_table[10] = { /* good lazy nice chain */ /* 0 */ {0, 0, 0, 0, deflate_stored}, /* store only */ /* 1 */ {4, 4, 8, 4, deflate_fast}, /* maximum speed, no lazy matches */ /* 2 */ {4, 5, 16, 8, deflate_fast}, /* 3 */ {4, 6, 32, 32, deflate_fast}, /* 4 */ {4, 4, 16, 16, deflate_slow}, /* lazy matches */ /* 5 */ {8, 16, 32, 32, deflate_slow}, /* 6 */ {8, 16, 128, 128, deflate_slow}, /* 7 */ {8, 32, 128, 256, deflate_slow}, /* 8 */ {32, 128, 258, 1024, deflate_slow}, /* 9 */ {32, 258, 258, 4096, deflate_slow}}; /* maximum compression */ /* Note: the deflate() code requires max_lazy >= MIN_MATCH and max_chain >= 4 * For deflate_fast() (levels <= 3) good is ignored and lazy has a different * meaning. 
*/ /* =========================================================================== * Update a hash value with the given input byte * IN assertion: all calls to UPDATE_HASH are made with consecutive * input characters, so that a running hash key can be computed from the * previous key instead of complete recalculation each time. */ #define UPDATE_HASH(s,h,c) (h = (((h)<<s->hash_shift) ^ (c)) & s->hash_mask) /* =========================================================================== * Insert string str in the dictionary and set match_head to the previous head * of the hash chain (the most recent string with same hash key). Return * the previous length of the hash chain. * IN assertion: all calls to INSERT_STRING are made with consecutive * input characters and the first MIN_MATCH bytes of str are valid * (except for the last MIN_MATCH-1 bytes of the input file). */ #define INSERT_STRING(s, str, match_head) \ (UPDATE_HASH(s, s->ins_h, s->window[(str) + (MIN_MATCH-1)]), \ s->prev[(str) & s->w_mask] = match_head = s->head[s->ins_h], \ s->head[s->ins_h] = (Pos)(str)) /* =========================================================================== * Initialize the hash table (avoiding 64K overflow for 16 bit systems). * prev[] will be initialized on the fly. */ #define CLEAR_HASH(s) \ s->head[s->hash_size-1] = NIL; \ memset((char *)s->head, 0, (unsigned)(s->hash_size-1)*sizeof(*s->head)); /* ========================================================================= */ int zlib_deflateInit2( z_streamp strm, int level, int method, int windowBits, int memLevel, int strategy ) { deflate_state *s; int noheader = 0; deflate_workspace *mem; char *next; ush *overlay; /* We overlay pending_buf and d_buf+l_buf. This works since the average * output size for (length,distance) codes is <= 24 bits. */ if (strm == NULL) return Z_STREAM_ERROR; strm->msg = NULL; if (level == Z_DEFAULT_COMPRESSION) level = 6; mem = (deflate_workspace *) strm->workspace; if (windowBits < 0) { /* undocumented feature: suppress zlib header */ noheader = 1; windowBits = -windowBits; } if (memLevel < 1 || memLevel > MAX_MEM_LEVEL || method != Z_DEFLATED || windowBits < 9 || windowBits > 15 || level < 0 || level > 9 || strategy < 0 || strategy > Z_HUFFMAN_ONLY) { return Z_STREAM_ERROR; } /* * Direct the workspace's pointers to the chunks that were allocated * along with the deflate_workspace struct. */ next = (char *) mem; next += sizeof(*mem); #ifdef CONFIG_ZLIB_DFLTCC /* * DFLTCC requires the window to be page aligned. * Thus, we overallocate and take the aligned portion of the buffer. 
*/ mem->window_memory = (Byte *) PTR_ALIGN(next, PAGE_SIZE); #else mem->window_memory = (Byte *) next; #endif next += zlib_deflate_window_memsize(windowBits); mem->prev_memory = (Pos *) next; next += zlib_deflate_prev_memsize(windowBits); mem->head_memory = (Pos *) next; next += zlib_deflate_head_memsize(memLevel); mem->overlay_memory = next; s = (deflate_state *) &(mem->deflate_memory); strm->state = (struct internal_state *)s; s->strm = strm; s->noheader = noheader; s->w_bits = windowBits; s->w_size = 1 << s->w_bits; s->w_mask = s->w_size - 1; s->hash_bits = memLevel + 7; s->hash_size = 1 << s->hash_bits; s->hash_mask = s->hash_size - 1; s->hash_shift = ((s->hash_bits+MIN_MATCH-1)/MIN_MATCH); s->window = (Byte *) mem->window_memory; s->prev = (Pos *) mem->prev_memory; s->head = (Pos *) mem->head_memory; s->lit_bufsize = 1 << (memLevel + 6); /* 16K elements by default */ overlay = (ush *) mem->overlay_memory; s->pending_buf = (uch *) overlay; s->pending_buf_size = (ulg)s->lit_bufsize * (sizeof(ush)+2L); s->d_buf = overlay + s->lit_bufsize/sizeof(ush); s->l_buf = s->pending_buf + (1+sizeof(ush))*s->lit_bufsize; s->level = level; s->strategy = strategy; s->method = (Byte)method; return zlib_deflateReset(strm); } /* ========================================================================= */ int zlib_deflateReset( z_streamp strm ) { deflate_state *s; if (strm == NULL || strm->state == NULL) return Z_STREAM_ERROR; strm->total_in = strm->total_out = 0; strm->msg = NULL; strm->data_type = Z_UNKNOWN; s = (deflate_state *)strm->state; s->pending = 0; s->pending_out = s->pending_buf; if (s->noheader < 0) { s->noheader = 0; /* was set to -1 by deflate(..., Z_FINISH); */ } s->status = s->noheader ? BUSY_STATE : INIT_STATE; strm->adler = 1; s->last_flush = Z_NO_FLUSH; zlib_tr_init(s); lm_init(s); DEFLATE_RESET_HOOK(strm); return Z_OK; } /* ========================================================================= * Put a short in the pending buffer. The 16-bit value is put in MSB order. * IN assertion: the stream state is correct and there is enough room in * pending_buf. 
*/ static void putShortMSB( deflate_state *s, uInt b ) { put_byte(s, (Byte)(b >> 8)); put_byte(s, (Byte)(b & 0xff)); } /* ========================================================================= */ int zlib_deflate( z_streamp strm, int flush ) { int old_flush; /* value of flush param for previous deflate call */ deflate_state *s; if (strm == NULL || strm->state == NULL || flush > Z_FINISH || flush < 0) { return Z_STREAM_ERROR; } s = (deflate_state *) strm->state; if ((strm->next_in == NULL && strm->avail_in != 0) || (s->status == FINISH_STATE && flush != Z_FINISH)) { return Z_STREAM_ERROR; } if (strm->avail_out == 0) return Z_BUF_ERROR; s->strm = strm; /* just in case */ old_flush = s->last_flush; s->last_flush = flush; /* Write the zlib header */ if (s->status == INIT_STATE) { uInt header = (Z_DEFLATED + ((s->w_bits-8)<<4)) << 8; uInt level_flags = (s->level-1) >> 1; if (level_flags > 3) level_flags = 3; header |= (level_flags << 6); if (s->strstart != 0) header |= PRESET_DICT; header += 31 - (header % 31); s->status = BUSY_STATE; putShortMSB(s, header); /* Save the adler32 of the preset dictionary: */ if (s->strstart != 0) { putShortMSB(s, (uInt)(strm->adler >> 16)); putShortMSB(s, (uInt)(strm->adler & 0xffff)); } strm->adler = 1L; } /* Flush as much pending output as possible */ if (s->pending != 0) { flush_pending(strm); if (strm->avail_out == 0) { /* Since avail_out is 0, deflate will be called again with * more output space, but possibly with both pending and * avail_in equal to zero. There won't be anything to do, * but this is not an error situation so make sure we * return OK instead of BUF_ERROR at next call of deflate: */ s->last_flush = -1; return Z_OK; } /* Make sure there is something to do and avoid duplicate consecutive * flushes. For repeated and useless calls with Z_FINISH, we keep * returning Z_STREAM_END instead of Z_BUFF_ERROR. */ } else if (strm->avail_in == 0 && flush <= old_flush && flush != Z_FINISH) { return Z_BUF_ERROR; } /* User must not provide more input after the first FINISH: */ if (s->status == FINISH_STATE && strm->avail_in != 0) { return Z_BUF_ERROR; } /* Start a new block or continue the current one. */ if (strm->avail_in != 0 || s->lookahead != 0 || (flush != Z_NO_FLUSH && s->status != FINISH_STATE)) { block_state bstate; bstate = DEFLATE_HOOK(strm, flush, &bstate) ? bstate : (*(configuration_table[s->level].func))(s, flush); if (bstate == finish_started || bstate == finish_done) { s->status = FINISH_STATE; } if (bstate == need_more || bstate == finish_started) { if (strm->avail_out == 0) { s->last_flush = -1; /* avoid BUF_ERROR next call, see above */ } return Z_OK; /* If flush != Z_NO_FLUSH && avail_out == 0, the next call * of deflate should use the same flush parameter to make sure * that the flush is complete. So we don't have to output an * empty block here, this will be done at next call. This also * ensures that for a very small output buffer, we emit at most * one empty block. */ } if (bstate == block_done) { if (flush == Z_PARTIAL_FLUSH) { zlib_tr_align(s); } else if (flush == Z_PACKET_FLUSH) { /* Output just the 3-bit `stored' block type value, but not a zero length. */ zlib_tr_stored_type_only(s); } else { /* FULL_FLUSH or SYNC_FLUSH */ zlib_tr_stored_block(s, (char*)0, 0L, 0); /* For a full flush, this empty block will be recognized * as a special marker by inflate_sync(). 
*/ if (flush == Z_FULL_FLUSH) { CLEAR_HASH(s); /* forget history */ } } flush_pending(strm); if (strm->avail_out == 0) { s->last_flush = -1; /* avoid BUF_ERROR at next call, see above */ return Z_OK; } } } Assert(strm->avail_out > 0, "bug2"); if (flush != Z_FINISH) return Z_OK; if (!s->noheader) { /* Write zlib trailer (adler32) */ putShortMSB(s, (uInt)(strm->adler >> 16)); putShortMSB(s, (uInt)(strm->adler & 0xffff)); } flush_pending(strm); /* If avail_out is zero, the application will call deflate again * to flush the rest. */ if (!s->noheader) { s->noheader = -1; /* write the trailer only once! */ } if (s->pending == 0) { Assert(s->bi_valid == 0, "bi_buf not flushed"); return Z_STREAM_END; } return Z_OK; } /* ========================================================================= */ int zlib_deflateEnd( z_streamp strm ) { int status; deflate_state *s; if (strm == NULL || strm->state == NULL) return Z_STREAM_ERROR; s = (deflate_state *) strm->state; status = s->status; if (status != INIT_STATE && status != BUSY_STATE && status != FINISH_STATE) { return Z_STREAM_ERROR; } strm->state = NULL; return status == BUSY_STATE ? Z_DATA_ERROR : Z_OK; } /* =========================================================================== * Read a new buffer from the current input stream, update the adler32 * and total number of bytes read. All deflate() input goes through * this function so some applications may wish to modify it to avoid * allocating a large strm->next_in buffer and copying from it. * (See also flush_pending()). */ static int read_buf( z_streamp strm, Byte *buf, unsigned size ) { unsigned len = strm->avail_in; if (len > size) len = size; if (len == 0) return 0; strm->avail_in -= len; if (!DEFLATE_NEED_CHECKSUM(strm)) {} else if (!((deflate_state *)(strm->state))->noheader) { strm->adler = zlib_adler32(strm->adler, strm->next_in, len); } memcpy(buf, strm->next_in, len); strm->next_in += len; strm->total_in += len; return (int)len; } /* =========================================================================== * Initialize the "longest match" routines for a new zlib stream */ static void lm_init( deflate_state *s ) { s->window_size = (ulg)2L*s->w_size; CLEAR_HASH(s); /* Set the default configuration parameters: */ s->max_lazy_match = configuration_table[s->level].max_lazy; s->good_match = configuration_table[s->level].good_length; s->nice_match = configuration_table[s->level].nice_length; s->max_chain_length = configuration_table[s->level].max_chain; s->strstart = 0; s->block_start = 0L; s->lookahead = 0; s->match_length = s->prev_length = MIN_MATCH-1; s->match_available = 0; s->ins_h = 0; } /* =========================================================================== * Set match_start to the longest match starting at the given string and * return its length. Matches shorter or equal to prev_length are discarded, * in which case the result is equal to prev_length and match_start is * garbage. * IN assertions: cur_match is the head of the hash chain for the current * string (strstart) and its distance is <= MAX_DIST, and prev_length >= 1 * OUT assertion: the match length is not greater than s->lookahead. */ /* For 80x86 and 680x0, an optimized version will be provided in match.asm or * match.S. The code will be functionally equivalent. 
*/ static uInt longest_match( deflate_state *s, IPos cur_match /* current match */ ) { unsigned chain_length = s->max_chain_length;/* max hash chain length */ register Byte *scan = s->window + s->strstart; /* current string */ register Byte *match; /* matched string */ register int len; /* length of current match */ int best_len = s->prev_length; /* best match length so far */ int nice_match = s->nice_match; /* stop if match long enough */ IPos limit = s->strstart > (IPos)MAX_DIST(s) ? s->strstart - (IPos)MAX_DIST(s) : NIL; /* Stop when cur_match becomes <= limit. To simplify the code, * we prevent matches with the string of window index 0. */ Pos *prev = s->prev; uInt wmask = s->w_mask; #ifdef UNALIGNED_OK /* Compare two bytes at a time. Note: this is not always beneficial. * Try with and without -DUNALIGNED_OK to check. */ register Byte *strend = s->window + s->strstart + MAX_MATCH - 1; register ush scan_start = *(ush*)scan; register ush scan_end = *(ush*)(scan+best_len-1); #else register Byte *strend = s->window + s->strstart + MAX_MATCH; register Byte scan_end1 = scan[best_len-1]; register Byte scan_end = scan[best_len]; #endif /* The code is optimized for HASH_BITS >= 8 and MAX_MATCH-2 multiple of 16. * It is easy to get rid of this optimization if necessary. */ Assert(s->hash_bits >= 8 && MAX_MATCH == 258, "Code too clever"); /* Do not waste too much time if we already have a good match: */ if (s->prev_length >= s->good_match) { chain_length >>= 2; } /* Do not look for matches beyond the end of the input. This is necessary * to make deflate deterministic. */ if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead; Assert((ulg)s->strstart <= s->window_size-MIN_LOOKAHEAD, "need lookahead"); do { Assert(cur_match < s->strstart, "no future"); match = s->window + cur_match; /* Skip to next match if the match length cannot increase * or if the match length is less than 2: */ #if (defined(UNALIGNED_OK) && MAX_MATCH == 258) /* This code assumes sizeof(unsigned short) == 2. Do not use * UNALIGNED_OK if your compiler uses a different size. */ if (*(ush*)(match+best_len-1) != scan_end || *(ush*)match != scan_start) continue; /* It is not necessary to compare scan[2] and match[2] since they are * always equal when the other bytes match, given that the hash keys * are equal and that HASH_BITS >= 8. Compare 2 bytes at a time at * strstart+3, +5, ... up to strstart+257. We check for insufficient * lookahead only every 4th comparison; the 128th check will be made * at strstart+257. If MAX_MATCH-2 is not a multiple of 8, it is * necessary to put more guard bytes at the end of the window, or * to check more often for insufficient lookahead. */ Assert(scan[2] == match[2], "scan[2]?"); scan++, match++; do { } while (*(ush*)(scan+=2) == *(ush*)(match+=2) && *(ush*)(scan+=2) == *(ush*)(match+=2) && *(ush*)(scan+=2) == *(ush*)(match+=2) && *(ush*)(scan+=2) == *(ush*)(match+=2) && scan < strend); /* The funny "do {}" generates better code on most compilers */ /* Here, scan <= window+strstart+257 */ Assert(scan <= s->window+(unsigned)(s->window_size-1), "wild scan"); if (*scan == *match) scan++; len = (MAX_MATCH - 1) - (int)(strend-scan); scan = strend - (MAX_MATCH-1); #else /* UNALIGNED_OK */ if (match[best_len] != scan_end || match[best_len-1] != scan_end1 || *match != *scan || *++match != scan[1]) continue; /* The check at best_len-1 can be removed because it will be made * again later. (This heuristic is not always a win.) 
* It is not necessary to compare scan[2] and match[2] since they * are always equal when the other bytes match, given that * the hash keys are equal and that HASH_BITS >= 8. */ scan += 2, match++; Assert(*scan == *match, "match[2]?"); /* We check for insufficient lookahead only every 8th comparison; * the 256th check will be made at strstart+258. */ do { } while (*++scan == *++match && *++scan == *++match && *++scan == *++match && *++scan == *++match && *++scan == *++match && *++scan == *++match && *++scan == *++match && *++scan == *++match && scan < strend); Assert(scan <= s->window+(unsigned)(s->window_size-1), "wild scan"); len = MAX_MATCH - (int)(strend - scan); scan = strend - MAX_MATCH; #endif /* UNALIGNED_OK */ if (len > best_len) { s->match_start = cur_match; best_len = len; if (len >= nice_match) break; #ifdef UNALIGNED_OK scan_end = *(ush*)(scan+best_len-1); #else scan_end1 = scan[best_len-1]; scan_end = scan[best_len]; #endif } } while ((cur_match = prev[cur_match & wmask]) > limit && --chain_length != 0); if ((uInt)best_len <= s->lookahead) return best_len; return s->lookahead; } #ifdef DEBUG_ZLIB /* =========================================================================== * Check that the match at match_start is indeed a match. */ static void check_match( deflate_state *s, IPos start, IPos match, int length ) { /* check that the match is indeed a match */ if (memcmp((char *)s->window + match, (char *)s->window + start, length)) { fprintf(stderr, " start %u, match %u, length %d\n", start, match, length); do { fprintf(stderr, "%c%c", s->window[match++], s->window[start++]); } while (--length != 0); z_error("invalid match"); } if (z_verbose > 1) { fprintf(stderr,"\\[%d,%d]", start-match, length); do { putc(s->window[start++], stderr); } while (--length != 0); } } #else # define check_match(s, start, match, length) #endif /* =========================================================================== * Fill the window when the lookahead becomes insufficient. * Updates strstart and lookahead. * * IN assertion: lookahead < MIN_LOOKAHEAD * OUT assertions: strstart <= window_size-MIN_LOOKAHEAD * At least one byte has been read, or avail_in == 0; reads are * performed for at least two bytes (required for the zip translate_eol * option -- not supported here). */ static void fill_window( deflate_state *s ) { register unsigned n, m; register Pos *p; unsigned more; /* Amount of free space at the end of the window. */ uInt wsize = s->w_size; do { more = (unsigned)(s->window_size -(ulg)s->lookahead -(ulg)s->strstart); /* Deal with !@#$% 64K limit: */ if (more == 0 && s->strstart == 0 && s->lookahead == 0) { more = wsize; } else if (more == (unsigned)(-1)) { /* Very unlikely, but possible on 16 bit machine if strstart == 0 * and lookahead == 1 (input done one byte at time) */ more--; /* If the window is almost full and there is insufficient lookahead, * move the upper half to the lower one to make room in the upper half. */ } else if (s->strstart >= wsize+MAX_DIST(s)) { memcpy((char *)s->window, (char *)s->window+wsize, (unsigned)wsize); s->match_start -= wsize; s->strstart -= wsize; /* we now have strstart >= MAX_DIST */ s->block_start -= (long) wsize; /* Slide the hash table (could be avoided with 32 bit values at the expense of memory usage). We slide even when level == 0 to keep the hash table consistent if we switch back to level > 0 later. (Using level 0 permanently is not an optimal usage of zlib, so we don't care about this pathological case.) 
*/ n = s->hash_size; p = &s->head[n]; do { m = *--p; *p = (Pos)(m >= wsize ? m-wsize : NIL); } while (--n); n = wsize; p = &s->prev[n]; do { m = *--p; *p = (Pos)(m >= wsize ? m-wsize : NIL); /* If n is not on any hash chain, prev[n] is garbage but * its value will never be used. */ } while (--n); more += wsize; } if (s->strm->avail_in == 0) return; /* If there was no sliding: * strstart <= WSIZE+MAX_DIST-1 && lookahead <= MIN_LOOKAHEAD - 1 && * more == window_size - lookahead - strstart * => more >= window_size - (MIN_LOOKAHEAD-1 + WSIZE + MAX_DIST-1) * => more >= window_size - 2*WSIZE + 2 * In the BIG_MEM or MMAP case (not yet supported), * window_size == input_size + MIN_LOOKAHEAD && * strstart + s->lookahead <= input_size => more >= MIN_LOOKAHEAD. * Otherwise, window_size == 2*WSIZE so more >= 2. * If there was sliding, more >= WSIZE. So in all cases, more >= 2. */ Assert(more >= 2, "more < 2"); n = read_buf(s->strm, s->window + s->strstart + s->lookahead, more); s->lookahead += n; /* Initialize the hash value now that we have some input: */ if (s->lookahead >= MIN_MATCH) { s->ins_h = s->window[s->strstart]; UPDATE_HASH(s, s->ins_h, s->window[s->strstart+1]); #if MIN_MATCH != 3 Call UPDATE_HASH() MIN_MATCH-3 more times #endif } /* If the whole input has less than MIN_MATCH bytes, ins_h is garbage, * but this is not important since only literal bytes will be emitted. */ } while (s->lookahead < MIN_LOOKAHEAD && s->strm->avail_in != 0); } /* =========================================================================== * Flush the current block, with given end-of-file flag. * IN assertion: strstart is set to the end of the current match. */ #define FLUSH_BLOCK_ONLY(s, eof) { \ zlib_tr_flush_block(s, (s->block_start >= 0L ? \ (char *)&s->window[(unsigned)s->block_start] : \ NULL), \ (ulg)((long)s->strstart - s->block_start), \ (eof)); \ s->block_start = s->strstart; \ flush_pending(s->strm); \ Tracev((stderr,"[FLUSH]")); \ } /* Same but force premature exit if necessary. */ #define FLUSH_BLOCK(s, eof) { \ FLUSH_BLOCK_ONLY(s, eof); \ if (s->strm->avail_out == 0) return (eof) ? finish_started : need_more; \ } /* =========================================================================== * Copy without compression as much as possible from the input stream, return * the current block state. * This function does not insert new strings in the dictionary since * uncompressible data is probably not useful. This function is used * only for the level=0 compression option. * NOTE: this function should be optimized to avoid extra copying from * window to pending_buf. 
*/ static block_state deflate_stored( deflate_state *s, int flush ) { /* Stored blocks are limited to 0xffff bytes, pending_buf is limited * to pending_buf_size, and each stored block has a 5 byte header: */ ulg max_block_size = 0xffff; ulg max_start; if (max_block_size > s->pending_buf_size - 5) { max_block_size = s->pending_buf_size - 5; } /* Copy as much as possible from input to output: */ for (;;) { /* Fill the window as much as possible: */ if (s->lookahead <= 1) { Assert(s->strstart < s->w_size+MAX_DIST(s) || s->block_start >= (long)s->w_size, "slide too late"); fill_window(s); if (s->lookahead == 0 && flush == Z_NO_FLUSH) return need_more; if (s->lookahead == 0) break; /* flush the current block */ } Assert(s->block_start >= 0L, "block gone"); s->strstart += s->lookahead; s->lookahead = 0; /* Emit a stored block if pending_buf will be full: */ max_start = s->block_start + max_block_size; if (s->strstart == 0 || (ulg)s->strstart >= max_start) { /* strstart == 0 is possible when wraparound on 16-bit machine */ s->lookahead = (uInt)(s->strstart - max_start); s->strstart = (uInt)max_start; FLUSH_BLOCK(s, 0); } /* Flush if we may have to slide, otherwise block_start may become * negative and the data will be gone: */ if (s->strstart - (uInt)s->block_start >= MAX_DIST(s)) { FLUSH_BLOCK(s, 0); } } FLUSH_BLOCK(s, flush == Z_FINISH); return flush == Z_FINISH ? finish_done : block_done; } /* =========================================================================== * Compress as much as possible from the input stream, return the current * block state. * This function does not perform lazy evaluation of matches and inserts * new strings in the dictionary only for unmatched strings or for short * matches. It is used only for the fast compression options. */ static block_state deflate_fast( deflate_state *s, int flush ) { IPos hash_head = NIL; /* head of the hash chain */ int bflush; /* set if current block must be flushed */ for (;;) { /* Make sure that we always have enough lookahead, except * at the end of the input file. We need MAX_MATCH bytes * for the next match, plus MIN_MATCH bytes to insert the * string following the next match. */ if (s->lookahead < MIN_LOOKAHEAD) { fill_window(s); if (s->lookahead < MIN_LOOKAHEAD && flush == Z_NO_FLUSH) { return need_more; } if (s->lookahead == 0) break; /* flush the current block */ } /* Insert the string window[strstart .. strstart+2] in the * dictionary, and set hash_head to the head of the hash chain: */ if (s->lookahead >= MIN_MATCH) { INSERT_STRING(s, s->strstart, hash_head); } /* Find the longest match, discarding those <= prev_length. * At this point we have always match_length < MIN_MATCH */ if (hash_head != NIL && s->strstart - hash_head <= MAX_DIST(s)) { /* To simplify the code, we prevent matches with the string * of window index 0 (in particular we have to avoid a match * of the string with itself at the start of the input file). */ if (s->strategy != Z_HUFFMAN_ONLY) { s->match_length = longest_match (s, hash_head); } /* longest_match() sets match_start */ } if (s->match_length >= MIN_MATCH) { check_match(s, s->strstart, s->match_start, s->match_length); bflush = zlib_tr_tally(s, s->strstart - s->match_start, s->match_length - MIN_MATCH); s->lookahead -= s->match_length; /* Insert new strings in the hash table only if the match length * is not too large. This saves time but degrades compression. 
*/ if (s->match_length <= s->max_insert_length && s->lookahead >= MIN_MATCH) { s->match_length--; /* string at strstart already in hash table */ do { s->strstart++; INSERT_STRING(s, s->strstart, hash_head); /* strstart never exceeds WSIZE-MAX_MATCH, so there are * always MIN_MATCH bytes ahead. */ } while (--s->match_length != 0); s->strstart++; } else { s->strstart += s->match_length; s->match_length = 0; s->ins_h = s->window[s->strstart]; UPDATE_HASH(s, s->ins_h, s->window[s->strstart+1]); #if MIN_MATCH != 3 Call UPDATE_HASH() MIN_MATCH-3 more times #endif /* If lookahead < MIN_MATCH, ins_h is garbage, but it does not * matter since it will be recomputed at next deflate call. */ } } else { /* No match, output a literal byte */ Tracevv((stderr,"%c", s->window[s->strstart])); bflush = zlib_tr_tally (s, 0, s->window[s->strstart]); s->lookahead--; s->strstart++; } if (bflush) FLUSH_BLOCK(s, 0); } FLUSH_BLOCK(s, flush == Z_FINISH); return flush == Z_FINISH ? finish_done : block_done; } /* =========================================================================== * Same as above, but achieves better compression. We use a lazy * evaluation for matches: a match is finally adopted only if there is * no better match at the next window position. */ static block_state deflate_slow( deflate_state *s, int flush ) { IPos hash_head = NIL; /* head of hash chain */ int bflush; /* set if current block must be flushed */ /* Process the input block. */ for (;;) { /* Make sure that we always have enough lookahead, except * at the end of the input file. We need MAX_MATCH bytes * for the next match, plus MIN_MATCH bytes to insert the * string following the next match. */ if (s->lookahead < MIN_LOOKAHEAD) { fill_window(s); if (s->lookahead < MIN_LOOKAHEAD && flush == Z_NO_FLUSH) { return need_more; } if (s->lookahead == 0) break; /* flush the current block */ } /* Insert the string window[strstart .. strstart+2] in the * dictionary, and set hash_head to the head of the hash chain: */ if (s->lookahead >= MIN_MATCH) { INSERT_STRING(s, s->strstart, hash_head); } /* Find the longest match, discarding those <= prev_length. */ s->prev_length = s->match_length, s->prev_match = s->match_start; s->match_length = MIN_MATCH-1; if (hash_head != NIL && s->prev_length < s->max_lazy_match && s->strstart - hash_head <= MAX_DIST(s)) { /* To simplify the code, we prevent matches with the string * of window index 0 (in particular we have to avoid a match * of the string with itself at the start of the input file). */ if (s->strategy != Z_HUFFMAN_ONLY) { s->match_length = longest_match (s, hash_head); } /* longest_match() sets match_start */ if (s->match_length <= 5 && (s->strategy == Z_FILTERED || (s->match_length == MIN_MATCH && s->strstart - s->match_start > TOO_FAR))) { /* If prev_match is also MIN_MATCH, match_start is garbage * but we will ignore the current match anyway. */ s->match_length = MIN_MATCH-1; } } /* If there was a match at the previous step and the current * match is not better, output the previous match: */ if (s->prev_length >= MIN_MATCH && s->match_length <= s->prev_length) { uInt max_insert = s->strstart + s->lookahead - MIN_MATCH; /* Do not insert strings in hash table beyond this. */ check_match(s, s->strstart-1, s->prev_match, s->prev_length); bflush = zlib_tr_tally(s, s->strstart -1 - s->prev_match, s->prev_length - MIN_MATCH); /* Insert in hash table all strings up to the end of the match. * strstart-1 and strstart are already inserted. 
If there is not * enough lookahead, the last two strings are not inserted in * the hash table. */ s->lookahead -= s->prev_length-1; s->prev_length -= 2; do { if (++s->strstart <= max_insert) { INSERT_STRING(s, s->strstart, hash_head); } } while (--s->prev_length != 0); s->match_available = 0; s->match_length = MIN_MATCH-1; s->strstart++; if (bflush) FLUSH_BLOCK(s, 0); } else if (s->match_available) { /* If there was no match at the previous position, output a * single literal. If there was a match but the current match * is longer, truncate the previous match to a single literal. */ Tracevv((stderr,"%c", s->window[s->strstart-1])); if (zlib_tr_tally (s, 0, s->window[s->strstart-1])) { FLUSH_BLOCK_ONLY(s, 0); } s->strstart++; s->lookahead--; if (s->strm->avail_out == 0) return need_more; } else { /* There is no previous match to compare with, wait for * the next step to decide. */ s->match_available = 1; s->strstart++; s->lookahead--; } } Assert (flush != Z_NO_FLUSH, "no flush?"); if (s->match_available) { Tracevv((stderr,"%c", s->window[s->strstart-1])); zlib_tr_tally (s, 0, s->window[s->strstart-1]); s->match_available = 0; } FLUSH_BLOCK(s, flush == Z_FINISH); return flush == Z_FINISH ? finish_done : block_done; } int zlib_deflate_workspacesize(int windowBits, int memLevel) { if (windowBits < 0) /* undocumented feature: suppress zlib header */ windowBits = -windowBits; /* Since the return value is typically passed to vmalloc() unchecked... */ BUG_ON(memLevel < 1 || memLevel > MAX_MEM_LEVEL || windowBits < 9 || windowBits > 15); return sizeof(deflate_workspace) + zlib_deflate_window_memsize(windowBits) + zlib_deflate_prev_memsize(windowBits) + zlib_deflate_head_memsize(memLevel) + zlib_deflate_overlay_memsize(memLevel); } int zlib_deflate_dfltcc_enabled(void) { return DEFLATE_DFLTCC_ENABLED(); }
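/*
 * Illustrative usage sketch (not part of deflate.c): how kernel code
 * typically drives this deflate implementation -- allocate the workspace
 * sized by zlib_deflate_workspacesize(), initialize with
 * zlib_deflateInit2(), compress, and tear down. deflate_buf() and its
 * parameter names are hypothetical, and a one-shot Z_FINISH call is
 * assumed; streaming callers would instead loop over zlib_deflate()
 * with partial buffers.
 */
#include <linux/zlib.h>
#include <linux/vmalloc.h>
#include <linux/errno.h>

static int deflate_buf(const u8 *src, unsigned int slen,
		       u8 *dst, unsigned int *dlen)
{
	struct z_stream_s strm = {};
	int ret = -EINVAL;

	/* workspace size depends only on windowBits and memLevel */
	strm.workspace = vzalloc(zlib_deflate_workspacesize(MAX_WBITS,
							    MAX_MEM_LEVEL));
	if (!strm.workspace)
		return -ENOMEM;

	if (zlib_deflateInit2(&strm, Z_DEFAULT_COMPRESSION, Z_DEFLATED,
			      MAX_WBITS, MAX_MEM_LEVEL,
			      Z_DEFAULT_STRATEGY) != Z_OK)
		goto out;

	strm.next_in = src;
	strm.avail_in = slen;
	strm.next_out = dst;
	strm.avail_out = *dlen;

	/* single shot: all input is available, so Z_FINISH may be used */
	if (zlib_deflate(&strm, Z_FINISH) == Z_STREAM_END) {
		*dlen = strm.total_out;
		ret = 0;
	}
	zlib_deflateEnd(&strm);
out:
	vfree(strm.workspace);
	return ret;
}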
// SPDX-License-Identifier: GPL-2.0-only
/*
 *	STP SAP demux
 *
 *	Copyright (c) 2008 Patrick McHardy <kaber@trash.net>
 */
#include <linux/mutex.h>
#include <linux/skbuff.h>
#include <linux/etherdevice.h>
#include <linux/llc.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <net/llc.h>
#include <net/llc_pdu.h>
#include <net/stp.h>

/* 01:80:c2:00:00:20 - 01:80:c2:00:00:2F */
#define GARP_ADDR_MIN	0x20
#define GARP_ADDR_MAX	0x2F
#define GARP_ADDR_RANGE	(GARP_ADDR_MAX - GARP_ADDR_MIN)

static const struct stp_proto __rcu *garp_protos[GARP_ADDR_RANGE + 1] __read_mostly;
static const struct stp_proto __rcu *stp_proto __read_mostly;

static struct llc_sap *sap __read_mostly;
static unsigned int sap_registered;
static DEFINE_MUTEX(stp_proto_mutex);

/* Called under rcu_read_lock from LLC */
static int stp_pdu_rcv(struct sk_buff *skb, struct net_device *dev,
		       struct packet_type *pt, struct net_device *orig_dev)
{
	const struct ethhdr *eh = eth_hdr(skb);
	const struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb);
	const struct stp_proto *proto;

	if (pdu->ssap != LLC_SAP_BSPAN ||
	    pdu->dsap != LLC_SAP_BSPAN ||
	    pdu->ctrl_1 != LLC_PDU_TYPE_U)
		goto err;

	if (eh->h_dest[5] >= GARP_ADDR_MIN && eh->h_dest[5] <= GARP_ADDR_MAX) {
		proto = rcu_dereference(garp_protos[eh->h_dest[5] -
						    GARP_ADDR_MIN]);
		if (proto &&
		    !ether_addr_equal(eh->h_dest, proto->group_address))
			goto err;
	} else
		proto = rcu_dereference(stp_proto);

	if (!proto)
		goto err;

	proto->rcv(proto, skb, dev);
	return 0;

err:
	kfree_skb(skb);
	return 0;
}

int stp_proto_register(const struct stp_proto *proto)
{
	int err = 0;

	mutex_lock(&stp_proto_mutex);
	if (sap_registered++ == 0) {
		sap = llc_sap_open(LLC_SAP_BSPAN, stp_pdu_rcv);
		if (!sap) {
			err = -ENOMEM;
			goto out;
		}
	}
	if (is_zero_ether_addr(proto->group_address))
		rcu_assign_pointer(stp_proto, proto);
	else
		rcu_assign_pointer(garp_protos[proto->group_address[5] -
					       GARP_ADDR_MIN], proto);
out:
	mutex_unlock(&stp_proto_mutex);
	return err;
}
EXPORT_SYMBOL_GPL(stp_proto_register);

void stp_proto_unregister(const struct stp_proto *proto)
{
	mutex_lock(&stp_proto_mutex);
	if (is_zero_ether_addr(proto->group_address))
		RCU_INIT_POINTER(stp_proto, NULL);
	else
		RCU_INIT_POINTER(garp_protos[proto->group_address[5] -
					     GARP_ADDR_MIN], NULL);
	synchronize_rcu();

	if (--sap_registered == 0)
		llc_sap_put(sap);
	mutex_unlock(&stp_proto_mutex);
}
EXPORT_SYMBOL_GPL(stp_proto_unregister);

MODULE_DESCRIPTION("SAP demux for IEEE 802.1D Spanning Tree Protocol (STP)");
MODULE_LICENSE("GPL");
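/*
 * Illustrative usage sketch (not part of stp.c): how a protocol such as
 * the bridge STP code or a GARP application registers with this demux.
 * demo_stp_rcv, demo_proto, demo_init and demo_exit are hypothetical
 * names; a real receiver must consume or free the skb it is handed.
 */
#include <linux/module.h>
#include <linux/skbuff.h>
#include <net/stp.h>

static void demo_stp_rcv(const struct stp_proto *proto, struct sk_buff *skb,
			 struct net_device *dev)
{
	/* called from stp_pdu_rcv() above, under rcu_read_lock() */
	kfree_skb(skb);
}

/*
 * A zeroed group_address selects the plain STP path in stp_pdu_rcv();
 * a GARP application would instead fill in its 01:80:c2:00:00:2X address.
 */
static const struct stp_proto demo_proto __read_mostly = {
	.rcv	= demo_stp_rcv,
};

static int __init demo_init(void)
{
	return stp_proto_register(&demo_proto);
}

static void __exit demo_exit(void)
{
	stp_proto_unregister(&demo_proto);
}

module_init(demo_init);
module_exit(demo_exit);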
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_SCHED_H
#define _LINUX_SCHED_H

/*
 * Define 'struct task_struct' and provide the main scheduler
 * APIs (schedule(), wakeup variants, etc.)
 */

#include <uapi/linux/sched.h>

#include <asm/current.h>
#include <asm/processor.h>
#include <linux/thread_info.h>
#include <linux/preempt.h>
#include <linux/cpumask_types.h>

#include <linux/cache.h>
#include <linux/irqflags_types.h>
#include <linux/smp_types.h>
#include <linux/pid_types.h>
#include <linux/sem_types.h>
#include <linux/shm.h>
#include <linux/kmsan_types.h>
#include <linux/mutex_types.h>
#include <linux/plist_types.h>
#include <linux/hrtimer_types.h>
#include <linux/timer_types.h>
#include <linux/seccomp_types.h>
#include <linux/nodemask_types.h>
#include <linux/refcount_types.h>
#include <linux/resource.h>
#include <linux/latencytop.h>
#include <linux/sched/prio.h>
#include <linux/sched/types.h>
#include <linux/signal_types.h>
#include <linux/spinlock.h>
#include <linux/syscall_user_dispatch_types.h>
#include <linux/mm_types_task.h>
#include <linux/netdevice_xmit.h>
#include <linux/task_io_accounting.h>
#include <linux/posix-timers_types.h>
#include <linux/restart_block.h>
#include <uapi/linux/rseq.h>
#include <linux/seqlock_types.h>
#include <linux/kcsan.h>
#include <linux/rv.h>
#include <linux/uidgid_types.h>
#include <linux/tracepoint-defs.h>
#include <linux/unwind_deferred_types.h>
#include <asm/kmap_size.h>

#ifndef COMPILE_OFFSETS
#include <generated/rq-offsets.h>
#endif

/* task_struct member predeclarations (sorted alphabetically): */
struct audit_context;
struct bio_list;
struct blk_plug;
struct bpf_local_storage;
struct bpf_run_ctx;
struct bpf_net_context;
struct capture_control;
struct cfs_rq;
struct fs_struct;
struct futex_pi_state;
struct io_context;
struct io_uring_task;
struct mempolicy;
struct nameidata;
struct nsproxy;
struct perf_event_context;
struct perf_ctx_data;
struct pid_namespace;
struct pipe_inode_info;
struct rcu_node;
struct reclaim_state;
struct robust_list_head;
struct root_domain;
struct rq;
struct sched_attr;
struct sched_dl_entity;
struct seq_file;
struct sighand_struct;
struct signal_struct;
struct task_delay_info;
struct task_group;
struct task_struct;
struct user_event_mm;

#include <linux/sched/ext.h>

/*
 * Task state bitmask. NOTE! These bits are also
 * encoded in fs/proc/array.c: get_task_state().
 *
 * We have two separate sets of flags: task->__state
 * is about runnability, while task->exit_state are
 * about the task exiting. Confusing, but this way
 * modifying one set can't modify the other one by
 * mistake.
 */

/* Used in tsk->__state: */
#define TASK_RUNNING			0x00000000
#define TASK_INTERRUPTIBLE		0x00000001
#define TASK_UNINTERRUPTIBLE		0x00000002
#define __TASK_STOPPED			0x00000004
#define __TASK_TRACED			0x00000008
/* Used in tsk->exit_state: */
#define EXIT_DEAD			0x00000010
#define EXIT_ZOMBIE			0x00000020
#define EXIT_TRACE			(EXIT_ZOMBIE | EXIT_DEAD)
/* Used in tsk->__state again: */
#define TASK_PARKED			0x00000040
#define TASK_DEAD			0x00000080
#define TASK_WAKEKILL			0x00000100
#define TASK_WAKING			0x00000200
#define TASK_NOLOAD			0x00000400
#define TASK_NEW			0x00000800
#define TASK_RTLOCK_WAIT		0x00001000
#define TASK_FREEZABLE			0x00002000
#define __TASK_FREEZABLE_UNSAFE		(0x00004000 * IS_ENABLED(CONFIG_LOCKDEP))
#define TASK_FROZEN			0x00008000
#define TASK_STATE_MAX			0x00010000

#define TASK_ANY			(TASK_STATE_MAX-1)

/*
 * DO NOT ADD ANY NEW USERS !
 */
#define TASK_FREEZABLE_UNSAFE		(TASK_FREEZABLE | __TASK_FREEZABLE_UNSAFE)

/* Convenience macros for the sake of set_current_state: */
#define TASK_KILLABLE			(TASK_WAKEKILL | TASK_UNINTERRUPTIBLE)
#define TASK_STOPPED			(TASK_WAKEKILL | __TASK_STOPPED)
#define TASK_TRACED			__TASK_TRACED

#define TASK_IDLE			(TASK_UNINTERRUPTIBLE | TASK_NOLOAD)

/* Convenience macros for the sake of wake_up(): */
#define TASK_NORMAL			(TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE)

/* get_task_state(): */
#define TASK_REPORT			(TASK_RUNNING | TASK_INTERRUPTIBLE | \
					 TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \
					 __TASK_TRACED | EXIT_DEAD | EXIT_ZOMBIE | \
					 TASK_PARKED)

#define task_is_running(task)		(READ_ONCE((task)->__state) == TASK_RUNNING)

#define task_is_traced(task)		((READ_ONCE(task->jobctl) & JOBCTL_TRACED) != 0)
#define task_is_stopped(task)		((READ_ONCE(task->jobctl) & JOBCTL_STOPPED) != 0)
#define task_is_stopped_or_traced(task)	((READ_ONCE(task->jobctl) & (JOBCTL_STOPPED | JOBCTL_TRACED)) != 0)

/*
 * Special states are those that do not use the normal wait-loop pattern. See
 * the comment with set_special_state().
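 *
 * As an aside, a minimal sketch (hypothetical helper, not part of this
 * header) of how the task_is_*() accessors above can be combined to
 * classify a task:
 *
 *	static const char *task_run_state_name(struct task_struct *p)
 *	{
 *		if (task_is_running(p))
 *			return "running";
 *		if (task_is_traced(p))
 *			return "traced";
 *		if (task_is_stopped(p))
 *			return "stopped";
 *		return "sleeping or exiting";
 *	}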
*/ #define is_special_task_state(state) \ ((state) & (__TASK_STOPPED | __TASK_TRACED | TASK_PARKED | \ TASK_DEAD | TASK_FROZEN)) #ifdef CONFIG_DEBUG_ATOMIC_SLEEP # define debug_normal_state_change(state_value) \ do { \ WARN_ON_ONCE(is_special_task_state(state_value)); \ current->task_state_change = _THIS_IP_; \ } while (0) # define debug_special_state_change(state_value) \ do { \ WARN_ON_ONCE(!is_special_task_state(state_value)); \ current->task_state_change = _THIS_IP_; \ } while (0) # define debug_rtlock_wait_set_state() \ do { \ current->saved_state_change = current->task_state_change;\ current->task_state_change = _THIS_IP_; \ } while (0) # define debug_rtlock_wait_restore_state() \ do { \ current->task_state_change = current->saved_state_change;\ } while (0) #else # define debug_normal_state_change(cond) do { } while (0) # define debug_special_state_change(cond) do { } while (0) # define debug_rtlock_wait_set_state() do { } while (0) # define debug_rtlock_wait_restore_state() do { } while (0) #endif #define trace_set_current_state(state_value) \ do { \ if (tracepoint_enabled(sched_set_state_tp)) \ __trace_set_current_state(state_value); \ } while (0) /* * set_current_state() includes a barrier so that the write of current->__state * is correctly serialised wrt the caller's subsequent test of whether to * actually sleep: * * for (;;) { * set_current_state(TASK_UNINTERRUPTIBLE); * if (CONDITION) * break; * * schedule(); * } * __set_current_state(TASK_RUNNING); * * If the caller does not need such serialisation (because, for instance, the * CONDITION test and condition change and wakeup are under the same lock) then * use __set_current_state(). * * The above is typically ordered against the wakeup, which does: * * CONDITION = 1; * wake_up_state(p, TASK_UNINTERRUPTIBLE); * * where wake_up_state()/try_to_wake_up() executes a full memory barrier before * accessing p->__state. * * Wakeup will do: if (@state & p->__state) p->__state = TASK_RUNNING, that is, * once it observes the TASK_UNINTERRUPTIBLE store the waking CPU can issue a * TASK_RUNNING store which can collide with __set_current_state(TASK_RUNNING). * * However, with slightly different timing the wakeup TASK_RUNNING store can * also collide with the TASK_UNINTERRUPTIBLE store. Losing that store is not * a problem either because that will result in one extra go around the loop * and our @cond test will save the day. * * Also see the comments of try_to_wake_up(). */ #define __set_current_state(state_value) \ do { \ debug_normal_state_change((state_value)); \ trace_set_current_state(state_value); \ WRITE_ONCE(current->__state, (state_value)); \ } while (0) #define set_current_state(state_value) \ do { \ debug_normal_state_change((state_value)); \ trace_set_current_state(state_value); \ smp_store_mb(current->__state, (state_value)); \ } while (0) /* * set_special_state() should be used for those states when the blocking task * can not use the regular condition based wait-loop. In that case we must * serialize against wakeups such that any possible in-flight TASK_RUNNING * stores will not collide with our state change. 
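 *
 * For illustration, a minimal sketch of the regular condition-based wait-loop
 * described above, paired with its waker (hypothetical flag and function
 * names; 'waiter' is the task executing consumer()):
 *
 *	static bool data_ready;
 *
 *	static void consumer(void)
 *	{
 *		for (;;) {
 *			set_current_state(TASK_UNINTERRUPTIBLE);
 *			if (READ_ONCE(data_ready))
 *				break;
 *			schedule();
 *		}
 *		__set_current_state(TASK_RUNNING);
 *	}
 *
 *	static void producer(struct task_struct *waiter)
 *	{
 *		WRITE_ONCE(data_ready, true);
 *		wake_up_process(waiter);
 *	}
 *
 * wake_up_process() (declared further down in this header) provides the full
 * barrier that orders the data_ready store against its read of
 * waiter->__state, matching the ordering discussion above.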
*/ #define set_special_state(state_value) \ do { \ unsigned long flags; /* may shadow */ \ \ raw_spin_lock_irqsave(&current->pi_lock, flags); \ debug_special_state_change((state_value)); \ trace_set_current_state(state_value); \ WRITE_ONCE(current->__state, (state_value)); \ raw_spin_unlock_irqrestore(&current->pi_lock, flags); \ } while (0) /* * PREEMPT_RT specific variants for "sleeping" spin/rwlocks * * RT's spin/rwlock substitutions are state preserving. The state of the * task when blocking on the lock is saved in task_struct::saved_state and * restored after the lock has been acquired. These operations are * serialized by task_struct::pi_lock against try_to_wake_up(). Any non RT * lock related wakeups while the task is blocked on the lock are * redirected to operate on task_struct::saved_state to ensure that these * are not dropped. On restore task_struct::saved_state is set to * TASK_RUNNING so any wakeup attempt redirected to saved_state will fail. * * The lock operation looks like this: * * current_save_and_set_rtlock_wait_state(); * for (;;) { * if (try_lock()) * break; * raw_spin_unlock_irq(&lock->wait_lock); * schedule_rtlock(); * raw_spin_lock_irq(&lock->wait_lock); * set_current_state(TASK_RTLOCK_WAIT); * } * current_restore_rtlock_saved_state(); */ #define current_save_and_set_rtlock_wait_state() \ do { \ lockdep_assert_irqs_disabled(); \ raw_spin_lock(&current->pi_lock); \ current->saved_state = current->__state; \ debug_rtlock_wait_set_state(); \ trace_set_current_state(TASK_RTLOCK_WAIT); \ WRITE_ONCE(current->__state, TASK_RTLOCK_WAIT); \ raw_spin_unlock(&current->pi_lock); \ } while (0); #define current_restore_rtlock_saved_state() \ do { \ lockdep_assert_irqs_disabled(); \ raw_spin_lock(&current->pi_lock); \ debug_rtlock_wait_restore_state(); \ trace_set_current_state(current->saved_state); \ WRITE_ONCE(current->__state, current->saved_state); \ current->saved_state = TASK_RUNNING; \ raw_spin_unlock(&current->pi_lock); \ } while (0); #define get_current_state() READ_ONCE(current->__state) /* * Define the task command name length as enum, then it can be visible to * BPF programs. */ enum { TASK_COMM_LEN = 16, }; extern void sched_tick(void); #define MAX_SCHEDULE_TIMEOUT LONG_MAX extern long schedule_timeout(long timeout); extern long schedule_timeout_interruptible(long timeout); extern long schedule_timeout_killable(long timeout); extern long schedule_timeout_uninterruptible(long timeout); extern long schedule_timeout_idle(long timeout); asmlinkage void schedule(void); extern void schedule_preempt_disabled(void); asmlinkage void preempt_schedule_irq(void); #ifdef CONFIG_PREEMPT_RT extern void schedule_rtlock(void); #endif extern int __must_check io_schedule_prepare(void); extern void io_schedule_finish(int token); extern long io_schedule_timeout(long timeout); extern void io_schedule(void); /* wrapper functions to trace from this header file */ DECLARE_TRACEPOINT(sched_set_state_tp); extern void __trace_set_current_state(int state_value); DECLARE_TRACEPOINT(sched_set_need_resched_tp); extern void __trace_set_need_resched(struct task_struct *curr, int tif); /** * struct prev_cputime - snapshot of system and user cputime * @utime: time spent in user mode * @stime: time spent in system mode * @lock: protects the above two fields * * Stores previous user/system time values such that we can guarantee * monotonicity. 
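 *
 * Stepping back to the schedule_timeout*() helpers declared above, a minimal
 * sketch of a bounded wait (hypothetical flag pointer; signal handling
 * omitted for brevity):
 *
 *	static long wait_for_flag(bool *flag)
 *	{
 *		long left = HZ;
 *
 *		while (left) {
 *			set_current_state(TASK_INTERRUPTIBLE);
 *			if (READ_ONCE(*flag))
 *				break;
 *			left = schedule_timeout(left);
 *		}
 *		__set_current_state(TASK_RUNNING);
 *		return left;
 *	}
 *
 * schedule_timeout() expects the caller to have set the task state first;
 * schedule_timeout_interruptible()/_killable()/_uninterruptible() set it
 * themselves, and MAX_SCHEDULE_TIMEOUT means "no timeout".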
*/ struct prev_cputime { #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE u64 utime; u64 stime; raw_spinlock_t lock; #endif }; enum vtime_state { /* Task is sleeping or running in a CPU with VTIME inactive: */ VTIME_INACTIVE = 0, /* Task is idle */ VTIME_IDLE, /* Task runs in kernelspace in a CPU with VTIME active: */ VTIME_SYS, /* Task runs in userspace in a CPU with VTIME active: */ VTIME_USER, /* Task runs as guests in a CPU with VTIME active: */ VTIME_GUEST, }; struct vtime { seqcount_t seqcount; unsigned long long starttime; enum vtime_state state; unsigned int cpu; u64 utime; u64 stime; u64 gtime; }; /* * Utilization clamp constraints. * @UCLAMP_MIN: Minimum utilization * @UCLAMP_MAX: Maximum utilization * @UCLAMP_CNT: Utilization clamp constraints count */ enum uclamp_id { UCLAMP_MIN = 0, UCLAMP_MAX, UCLAMP_CNT }; extern struct root_domain def_root_domain; extern struct mutex sched_domains_mutex; extern void sched_domains_mutex_lock(void); extern void sched_domains_mutex_unlock(void); struct sched_param { int sched_priority; }; struct sched_info { #ifdef CONFIG_SCHED_INFO /* Cumulative counters: */ /* # of times we have run on this CPU: */ unsigned long pcount; /* Time spent waiting on a runqueue: */ unsigned long long run_delay; /* Max time spent waiting on a runqueue: */ unsigned long long max_run_delay; /* Min time spent waiting on a runqueue: */ unsigned long long min_run_delay; /* Timestamps: */ /* When did we last run on a CPU? */ unsigned long long last_arrival; /* When were we last queued to run? */ unsigned long long last_queued; #endif /* CONFIG_SCHED_INFO */ }; /* * Integer metrics need fixed point arithmetic, e.g., sched/fair * has a few: load, load_avg, util_avg, freq, and capacity. * * We define a basic fixed point arithmetic range, and then formalize * all these metrics based on that basic range. */ # define SCHED_FIXEDPOINT_SHIFT 10 # define SCHED_FIXEDPOINT_SCALE (1L << SCHED_FIXEDPOINT_SHIFT) /* Increase resolution of cpu_capacity calculations */ # define SCHED_CAPACITY_SHIFT SCHED_FIXEDPOINT_SHIFT # define SCHED_CAPACITY_SCALE (1L << SCHED_CAPACITY_SHIFT) struct load_weight { unsigned long weight; u32 inv_weight; }; /* * The load/runnable/util_avg accumulates an infinite geometric series * (see __update_load_avg_cfs_rq() in kernel/sched/pelt.c). * * [load_avg definition] * * load_avg = runnable% * scale_load_down(load) * * [runnable_avg definition] * * runnable_avg = runnable% * SCHED_CAPACITY_SCALE * * [util_avg definition] * * util_avg = running% * SCHED_CAPACITY_SCALE * * where runnable% is the time ratio that a sched_entity is runnable and * running% the time ratio that a sched_entity is running. * * For cfs_rq, they are the aggregated values of all runnable and blocked * sched_entities. * * The load/runnable/util_avg doesn't directly factor frequency scaling and CPU * capacity scaling. The scaling is done through the rq_clock_pelt that is used * for computing those signals (see update_rq_clock_pelt()) * * N.B., the above ratios (runnable% and running%) themselves are in the * range of [0, 1]. To do fixed point arithmetics, we therefore scale them * to as large a range as necessary. This is for example reflected by * util_avg's SCHED_CAPACITY_SCALE. * * [Overflow issue] * * The 64-bit load_sum can have 4353082796 (=2^64/47742/88761) entities * with the highest load (=88761), always runnable on a single cfs_rq, * and should not overflow as the number already hits PID_MAX_LIMIT. 
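 *
 * For reference, the fixed-point unit defined above is SCHED_CAPACITY_SCALE =
 * 1 << SCHED_FIXEDPOINT_SHIFT = 1024, so e.g. util_avg = 512 corresponds to a
 * task running roughly 50% of the time. Spelling out the overflow bound just
 * quoted:
 *
 *	2^64 / 47742 / 88761 ~= 4353082796
 *
 * with 47742 being the maximum value the decaying PELT sum can reach and
 * 88761 the highest load weight in the nice table (nice -20).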
* * For all other cases (including 32-bit kernels), struct load_weight's * weight will overflow first before we do, because: * * Max(load_avg) <= Max(load.weight) * * Then it is the load_weight's responsibility to consider overflow * issues. */ struct sched_avg { u64 last_update_time; u64 load_sum; u64 runnable_sum; u32 util_sum; u32 period_contrib; unsigned long load_avg; unsigned long runnable_avg; unsigned long util_avg; unsigned int util_est; } ____cacheline_aligned; /* * The UTIL_AVG_UNCHANGED flag is used to synchronize util_est with util_avg * updates. When a task is dequeued, its util_est should not be updated if its * util_avg has not been updated in the meantime. * This information is mapped into the MSB bit of util_est at dequeue time. * Since max value of util_est for a task is 1024 (PELT util_avg for a task) * it is safe to use MSB. */ #define UTIL_EST_WEIGHT_SHIFT 2 #define UTIL_AVG_UNCHANGED 0x80000000 struct sched_statistics { #ifdef CONFIG_SCHEDSTATS u64 wait_start; u64 wait_max; u64 wait_count; u64 wait_sum; u64 iowait_count; u64 iowait_sum; u64 sleep_start; u64 sleep_max; s64 sum_sleep_runtime; u64 block_start; u64 block_max; s64 sum_block_runtime; s64 exec_max; u64 slice_max; u64 nr_migrations_cold; u64 nr_failed_migrations_affine; u64 nr_failed_migrations_running; u64 nr_failed_migrations_hot; u64 nr_forced_migrations; u64 nr_wakeups; u64 nr_wakeups_sync; u64 nr_wakeups_migrate; u64 nr_wakeups_local; u64 nr_wakeups_remote; u64 nr_wakeups_affine; u64 nr_wakeups_affine_attempts; u64 nr_wakeups_passive; u64 nr_wakeups_idle; #ifdef CONFIG_SCHED_CORE u64 core_forceidle_sum; #endif #endif /* CONFIG_SCHEDSTATS */ } ____cacheline_aligned; struct sched_entity { /* For load-balancing: */ struct load_weight load; struct rb_node run_node; u64 deadline; u64 min_vruntime; u64 min_slice; struct list_head group_node; unsigned char on_rq; unsigned char sched_delayed; unsigned char rel_deadline; unsigned char custom_slice; /* hole */ u64 exec_start; u64 sum_exec_runtime; u64 prev_sum_exec_runtime; u64 vruntime; union { /* * When !@on_rq this field is vlag. * When cfs_rq->curr == se (which implies @on_rq) * this field is vprot. See protect_slice(). */ s64 vlag; u64 vprot; }; u64 slice; u64 nr_migrations; #ifdef CONFIG_FAIR_GROUP_SCHED int depth; struct sched_entity *parent; /* rq on which this entity is (to be) queued: */ struct cfs_rq *cfs_rq; /* rq "owned" by this entity/group: */ struct cfs_rq *my_q; /* cached value of my_q->h_nr_running */ unsigned long runnable_weight; #endif /* * Per entity load average tracking. * * Put into separate cache line so it does not * collide with read-mostly values above. */ struct sched_avg avg; }; struct sched_rt_entity { struct list_head run_list; unsigned long timeout; unsigned long watchdog_stamp; unsigned int time_slice; unsigned short on_rq; unsigned short on_list; struct sched_rt_entity *back; #ifdef CONFIG_RT_GROUP_SCHED struct sched_rt_entity *parent; /* rq on which this entity is (to be) queued: */ struct rt_rq *rt_rq; /* rq "owned" by this entity/group: */ struct rt_rq *my_q; #endif } __randomize_layout; typedef bool (*dl_server_has_tasks_f)(struct sched_dl_entity *); typedef struct task_struct *(*dl_server_pick_f)(struct sched_dl_entity *); struct sched_dl_entity { struct rb_node rb_node; /* * Original scheduling parameters. Copied here from sched_attr * during sched_setattr(), they will remain the same until * the next sched_setattr(). 
*/ u64 dl_runtime; /* Maximum runtime for each instance */ u64 dl_deadline; /* Relative deadline of each instance */ u64 dl_period; /* Separation of two instances (period) */ u64 dl_bw; /* dl_runtime / dl_period */ u64 dl_density; /* dl_runtime / dl_deadline */ /* * Actual scheduling parameters. Initialized with the values above, * they are continuously updated during task execution. Note that * the remaining runtime could be < 0 in case we are in overrun. */ s64 runtime; /* Remaining runtime for this instance */ u64 deadline; /* Absolute deadline for this instance */ unsigned int flags; /* Specifying the scheduler behaviour */ /* * Some bool flags: * * @dl_throttled tells if we exhausted the runtime. If so, the * task has to wait for a replenishment to be performed at the * next firing of dl_timer. * * @dl_yielded tells if task gave up the CPU before consuming * all its available runtime during the last job. * * @dl_non_contending tells if the task is inactive while still * contributing to the active utilization. In other words, it * indicates if the inactive timer has been armed and its handler * has not been executed yet. This flag is useful to avoid race * conditions between the inactive timer handler and the wakeup * code. * * @dl_overrun tells if the task asked to be informed about runtime * overruns. * * @dl_server tells if this is a server entity. * * @dl_defer tells if this is a deferred or regular server. For * now only defer server exists. * * @dl_defer_armed tells if the deferrable server is waiting * for the replenishment timer to activate it. * * @dl_server_active tells if the dlserver is active(started). * dlserver is started on first cfs enqueue on an idle runqueue * and is stopped when a dequeue results in 0 cfs tasks on the * runqueue. In other words, dlserver is active only when cpu's * runqueue has atleast one cfs task. * * @dl_defer_running tells if the deferrable server is actually * running, skipping the defer phase. */ unsigned int dl_throttled : 1; unsigned int dl_yielded : 1; unsigned int dl_non_contending : 1; unsigned int dl_overrun : 1; unsigned int dl_server : 1; unsigned int dl_server_active : 1; unsigned int dl_defer : 1; unsigned int dl_defer_armed : 1; unsigned int dl_defer_running : 1; /* * Bandwidth enforcement timer. Each -deadline task has its * own bandwidth to be enforced, thus we need one timer per task. */ struct hrtimer dl_timer; /* * Inactive timer, responsible for decreasing the active utilization * at the "0-lag time". When a -deadline task blocks, it contributes * to GRUB's active utilization until the "0-lag time", hence a * timer is needed to decrease the active utilization at the correct * time. */ struct hrtimer inactive_timer; /* * Bits for DL-server functionality. Also see the comment near * dl_server_update(). * * @rq the runqueue this server is for * * @server_has_tasks() returns true if @server_pick return a * runnable task. */ struct rq *rq; dl_server_pick_f server_pick_task; #ifdef CONFIG_RT_MUTEXES /* * Priority Inheritance. When a DEADLINE scheduling entity is boosted * pi_se points to the donor, otherwise points to the dl_se it belongs * to (the original one/itself). 
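 *
 * For illustration, a minimal sketch (hypothetical values, target task 'p')
 * of how the three parameters at the top of this struct are filled in from a
 * sched_attr; an in-kernel caller would go through sched_setattr(), declared
 * later in this header:
 *
 *	struct sched_attr attr = {
 *		.size		= sizeof(attr),
 *		.sched_policy	= SCHED_DEADLINE,
 *		.sched_runtime	= 10 * NSEC_PER_MSEC,
 *		.sched_deadline	= 30 * NSEC_PER_MSEC,
 *		.sched_period	= 100 * NSEC_PER_MSEC,
 *	};
 *	int ret = sched_setattr(p, &attr);
 *
 * giving dl_bw = dl_runtime / dl_period = 10% and dl_density = dl_runtime /
 * dl_deadline = 33% of a CPU, with runtime <= deadline <= period enforced at
 * admission.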
*/ struct sched_dl_entity *pi_se; #endif }; #ifdef CONFIG_UCLAMP_TASK /* Number of utilization clamp buckets (shorter alias) */ #define UCLAMP_BUCKETS CONFIG_UCLAMP_BUCKETS_COUNT /* * Utilization clamp for a scheduling entity * @value: clamp value "assigned" to a se * @bucket_id: bucket index corresponding to the "assigned" value * @active: the se is currently refcounted in a rq's bucket * @user_defined: the requested clamp value comes from user-space * * The bucket_id is the index of the clamp bucket matching the clamp value * which is pre-computed and stored to avoid expensive integer divisions from * the fast path. * * The active bit is set whenever a task has got an "effective" value assigned, * which can be different from the clamp value "requested" from user-space. * This allows to know a task is refcounted in the rq's bucket corresponding * to the "effective" bucket_id. * * The user_defined bit is set whenever a task has got a task-specific clamp * value requested from userspace, i.e. the system defaults apply to this task * just as a restriction. This allows to relax default clamps when a less * restrictive task-specific value has been requested, thus allowing to * implement a "nice" semantic. For example, a task running with a 20% * default boost can still drop its own boosting to 0%. */ struct uclamp_se { unsigned int value : bits_per(SCHED_CAPACITY_SCALE); unsigned int bucket_id : bits_per(UCLAMP_BUCKETS); unsigned int active : 1; unsigned int user_defined : 1; }; #endif /* CONFIG_UCLAMP_TASK */ union rcu_special { struct { u8 blocked; u8 need_qs; u8 exp_hint; /* Hint for performance. */ u8 need_mb; /* Readers need smp_mb(). */ } b; /* Bits. */ u32 s; /* Set of bits. */ }; enum perf_event_task_context { perf_invalid_context = -1, perf_hw_context = 0, perf_sw_context, perf_nr_task_contexts, }; /* * Number of contexts where an event can trigger: * task, softirq, hardirq, nmi. */ #define PERF_NR_CONTEXTS 4 struct wake_q_node { struct wake_q_node *next; }; struct kmap_ctrl { #ifdef CONFIG_KMAP_LOCAL int idx; pte_t pteval[KM_MAX_IDX]; #endif }; struct task_struct { #ifdef CONFIG_THREAD_INFO_IN_TASK /* * For reasons of header soup (see current_thread_info()), this * must be the first element of task_struct. */ struct thread_info thread_info; #endif unsigned int __state; /* saved state for "spinlock sleepers" */ unsigned int saved_state; /* * This begins the randomizable portion of task_struct. Only * scheduling-critical items should be added above here. */ randomized_struct_fields_start void *stack; refcount_t usage; /* Per task flags (PF_*), defined further below: */ unsigned int flags; unsigned int ptrace; #ifdef CONFIG_MEM_ALLOC_PROFILING struct alloc_tag *alloc_tag; #endif int on_cpu; struct __call_single_node wake_entry; unsigned int wakee_flips; unsigned long wakee_flip_decay_ts; struct task_struct *last_wakee; /* * recent_used_cpu is initially set as the last CPU used by a task * that wakes affine another task. Waker/wakee relationships can * push tasks around a CPU where each wakeup moves to the next one. * Tracking a recently used CPU allows a quick search for a recently * used CPU that may be idle. 
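 *
 * Tying this back to struct uclamp_se above: the user_defined request
 * typically originates from a sched_attr with the clamp bits set, e.g. a
 * sketch with hypothetical values:
 *
 *	struct sched_attr attr = {
 *		.size		= sizeof(attr),
 *		.sched_flags	= SCHED_FLAG_KEEP_ALL |
 *				  SCHED_FLAG_UTIL_CLAMP_MIN |
 *				  SCHED_FLAG_UTIL_CLAMP_MAX,
 *		.sched_util_min	= 256,
 *		.sched_util_max	= 768,
 *	};
 *
 * which lands in uclamp_req[UCLAMP_MIN]/[UCLAMP_MAX] with user_defined set,
 * clamping the task's effective utilization to the 25%-75% band of
 * SCHED_CAPACITY_SCALE while leaving policy and parameters untouched.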
*/ int recent_used_cpu; int wake_cpu; int on_rq; int prio; int static_prio; int normal_prio; unsigned int rt_priority; struct sched_entity se; struct sched_rt_entity rt; struct sched_dl_entity dl; struct sched_dl_entity *dl_server; #ifdef CONFIG_SCHED_CLASS_EXT struct sched_ext_entity scx; #endif const struct sched_class *sched_class; #ifdef CONFIG_SCHED_CORE struct rb_node core_node; unsigned long core_cookie; unsigned int core_occupation; #endif #ifdef CONFIG_CGROUP_SCHED struct task_group *sched_task_group; #ifdef CONFIG_CFS_BANDWIDTH struct callback_head sched_throttle_work; struct list_head throttle_node; bool throttled; #endif #endif #ifdef CONFIG_UCLAMP_TASK /* * Clamp values requested for a scheduling entity. * Must be updated with task_rq_lock() held. */ struct uclamp_se uclamp_req[UCLAMP_CNT]; /* * Effective clamp values used for a scheduling entity. * Must be updated with task_rq_lock() held. */ struct uclamp_se uclamp[UCLAMP_CNT]; #endif struct sched_statistics stats; #ifdef CONFIG_PREEMPT_NOTIFIERS /* List of struct preempt_notifier: */ struct hlist_head preempt_notifiers; #endif #ifdef CONFIG_BLK_DEV_IO_TRACE unsigned int btrace_seq; #endif unsigned int policy; unsigned long max_allowed_capacity; int nr_cpus_allowed; const cpumask_t *cpus_ptr; cpumask_t *user_cpus_ptr; cpumask_t cpus_mask; void *migration_pending; unsigned short migration_disabled; unsigned short migration_flags; #ifdef CONFIG_PREEMPT_RCU int rcu_read_lock_nesting; union rcu_special rcu_read_unlock_special; struct list_head rcu_node_entry; struct rcu_node *rcu_blocked_node; #endif /* #ifdef CONFIG_PREEMPT_RCU */ #ifdef CONFIG_TASKS_RCU unsigned long rcu_tasks_nvcsw; u8 rcu_tasks_holdout; u8 rcu_tasks_idx; int rcu_tasks_idle_cpu; struct list_head rcu_tasks_holdout_list; int rcu_tasks_exit_cpu; struct list_head rcu_tasks_exit_list; #endif /* #ifdef CONFIG_TASKS_RCU */ #ifdef CONFIG_TASKS_TRACE_RCU int trc_reader_nesting; int trc_ipi_to_cpu; union rcu_special trc_reader_special; struct list_head trc_holdout_list; struct list_head trc_blkd_node; int trc_blkd_cpu; #endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ struct sched_info sched_info; struct list_head tasks; struct plist_node pushable_tasks; struct rb_node pushable_dl_tasks; struct mm_struct *mm; struct mm_struct *active_mm; struct address_space *faults_disabled_mapping; int exit_state; int exit_code; int exit_signal; /* The signal sent when the parent dies: */ int pdeath_signal; /* JOBCTL_*, siglock protected: */ unsigned long jobctl; /* Used for emulating ABI behavior of previous Linux versions: */ unsigned int personality; /* Scheduler bits, serialized by scheduler locks: */ unsigned sched_reset_on_fork:1; unsigned sched_contributes_to_load:1; unsigned sched_migrated:1; unsigned sched_task_hot:1; /* Force alignment to the next boundary: */ unsigned :0; /* Unserialized, strictly 'current' */ /* * This field must not be in the scheduler word above due to wakelist * queueing no longer being serialized by p->on_cpu. However: * * p->XXX = X; ttwu() * schedule() if (p->on_rq && ..) // false * smp_mb__after_spinlock(); if (smp_load_acquire(&p->on_cpu) && //true * deactivate_task() ttwu_queue_wakelist()) * p->on_rq = 0; p->sched_remote_wakeup = Y; * * guarantees all stores of 'current' are visible before * ->sched_remote_wakeup gets used, so it can be in this word. 
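 *
 * A brief note on the priority fields near the top of this struct (see
 * <linux/sched/prio.h> for the constants):
 *
 *	static_prio = NICE_TO_PRIO(nice) = 120 + nice	(SCHED_NORMAL)
 *	prio        = MAX_RT_PRIO - 1 - rt_priority	(SCHED_FIFO/SCHED_RR)
 *
 * so nice 0 maps to 120 and nice -20 to 100; prio itself may additionally be
 * boosted by priority inheritance.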
*/ unsigned sched_remote_wakeup:1; #ifdef CONFIG_RT_MUTEXES unsigned sched_rt_mutex:1; #endif /* Bit to tell TOMOYO we're in execve(): */ unsigned in_execve:1; unsigned in_iowait:1; #ifndef TIF_RESTORE_SIGMASK unsigned restore_sigmask:1; #endif #ifdef CONFIG_MEMCG_V1 unsigned in_user_fault:1; #endif #ifdef CONFIG_LRU_GEN /* whether the LRU algorithm may apply to this access */ unsigned in_lru_fault:1; #endif #ifdef CONFIG_COMPAT_BRK unsigned brk_randomized:1; #endif #ifdef CONFIG_CGROUPS /* disallow userland-initiated cgroup migration */ unsigned no_cgroup_migration:1; /* task is frozen/stopped (used by the cgroup freezer) */ unsigned frozen:1; #endif #ifdef CONFIG_BLK_CGROUP unsigned use_memdelay:1; #endif #ifdef CONFIG_PSI /* Stalled due to lack of memory */ unsigned in_memstall:1; #endif #ifdef CONFIG_PAGE_OWNER /* Used by page_owner=on to detect recursion in page tracking. */ unsigned in_page_owner:1; #endif #ifdef CONFIG_EVENTFD /* Recursion prevention for eventfd_signal() */ unsigned in_eventfd:1; #endif #ifdef CONFIG_ARCH_HAS_CPU_PASID unsigned pasid_activated:1; #endif #ifdef CONFIG_X86_BUS_LOCK_DETECT unsigned reported_split_lock:1; #endif #ifdef CONFIG_TASK_DELAY_ACCT /* delay due to memory thrashing */ unsigned in_thrashing:1; #endif unsigned in_nf_duplicate:1; #ifdef CONFIG_PREEMPT_RT struct netdev_xmit net_xmit; #endif unsigned long atomic_flags; /* Flags requiring atomic access. */ struct restart_block restart_block; pid_t pid; pid_t tgid; #ifdef CONFIG_STACKPROTECTOR /* Canary value for the -fstack-protector GCC feature: */ unsigned long stack_canary; #endif /* * Pointers to the (original) parent process, youngest child, younger sibling, * older sibling, respectively. (p->father can be replaced with * p->real_parent->pid) */ /* Real parent process: */ struct task_struct __rcu *real_parent; /* Recipient of SIGCHLD, wait4() reports: */ struct task_struct __rcu *parent; /* * Children/sibling form the list of natural children: */ struct list_head children; struct list_head sibling; struct task_struct *group_leader; /* * 'ptraced' is the list of tasks this task is using ptrace() on. * * This includes both natural children and PTRACE_ATTACH targets. * 'ptrace_entry' is this task's link on the p->parent->ptraced list. */ struct list_head ptraced; struct list_head ptrace_entry; /* PID/PID hash table linkage. 
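 *
 * For illustration, a minimal sketch (error handling omitted) of walking the
 * natural children linked through the children/sibling fields above; the
 * lists are protected by tasklist_lock or RCU:
 *
 *	struct task_struct *child;
 *
 *	read_lock(&tasklist_lock);
 *	list_for_each_entry(child, &parent->children, sibling)
 *		pr_info("child pid %d\n", child->pid);
 *	read_unlock(&tasklist_lock);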
*/ struct pid *thread_pid; struct hlist_node pid_links[PIDTYPE_MAX]; struct list_head thread_node; struct completion *vfork_done; /* CLONE_CHILD_SETTID: */ int __user *set_child_tid; /* CLONE_CHILD_CLEARTID: */ int __user *clear_child_tid; /* PF_KTHREAD | PF_IO_WORKER */ void *worker_private; u64 utime; u64 stime; #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME u64 utimescaled; u64 stimescaled; #endif u64 gtime; struct prev_cputime prev_cputime; #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN struct vtime vtime; #endif #ifdef CONFIG_NO_HZ_FULL atomic_t tick_dep_mask; #endif /* Context switch counts: */ unsigned long nvcsw; unsigned long nivcsw; /* Monotonic time in nsecs: */ u64 start_time; /* Boot based time in nsecs: */ u64 start_boottime; /* MM fault and swap info: this can arguably be seen as either mm-specific or thread-specific: */ unsigned long min_flt; unsigned long maj_flt; /* Empty if CONFIG_POSIX_CPUTIMERS=n */ struct posix_cputimers posix_cputimers; #ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK struct posix_cputimers_work posix_cputimers_work; #endif /* Process credentials: */ /* Tracer's credentials at attach: */ const struct cred __rcu *ptracer_cred; /* Objective and real subjective task credentials (COW): */ const struct cred __rcu *real_cred; /* Effective (overridable) subjective task credentials (COW): */ const struct cred __rcu *cred; #ifdef CONFIG_KEYS /* Cached requested key. */ struct key *cached_requested_key; #endif /* * executable name, excluding path. * * - normally initialized begin_new_exec() * - set it with set_task_comm() * - strscpy_pad() to ensure it is always NUL-terminated and * zero-padded * - task_lock() to ensure the operation is atomic and the name is * fully updated. */ char comm[TASK_COMM_LEN]; struct nameidata *nameidata; #ifdef CONFIG_SYSVIPC struct sysv_sem sysvsem; struct sysv_shm sysvshm; #endif #ifdef CONFIG_DETECT_HUNG_TASK unsigned long last_switch_count; unsigned long last_switch_time; #endif /* Filesystem information: */ struct fs_struct *fs; /* Open file information: */ struct files_struct *files; #ifdef CONFIG_IO_URING struct io_uring_task *io_uring; #endif /* Namespaces: */ struct nsproxy *nsproxy; /* Signal handlers: */ struct signal_struct *signal; struct sighand_struct __rcu *sighand; sigset_t blocked; sigset_t real_blocked; /* Restored if set_restore_sigmask() was used: */ sigset_t saved_sigmask; struct sigpending pending; unsigned long sas_ss_sp; size_t sas_ss_size; unsigned int sas_ss_flags; struct callback_head *task_works; #ifdef CONFIG_AUDIT #ifdef CONFIG_AUDITSYSCALL struct audit_context *audit_context; #endif kuid_t loginuid; unsigned int sessionid; #endif struct seccomp seccomp; struct syscall_user_dispatch syscall_dispatch; /* Thread group tracking: */ u64 parent_exec_id; u64 self_exec_id; /* Protection against (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed, mempolicy: */ spinlock_t alloc_lock; /* Protection of the PI data structures: */ raw_spinlock_t pi_lock; struct wake_q_node wake_q; #ifdef CONFIG_RT_MUTEXES /* PI waiters blocked on a rt_mutex held by this task: */ struct rb_root_cached pi_waiters; /* Updated under owner's pi_lock and rq lock */ struct task_struct *pi_top_task; /* Deadlock detection and priority inheritance handling: */ struct rt_mutex_waiter *pi_blocked_on; #endif struct mutex *blocked_on; /* lock we're blocked on */ #ifdef CONFIG_DETECT_HUNG_TASK_BLOCKER /* * Encoded lock address causing task block (lower 2 bits = type from * <linux/hung_task.h>). Accessed via hung_task_*() helpers. 
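 *
 * As a small aside on the comm[] rules documented above: because the buffer
 * is always NUL-terminated and zero-padded, readers need no lock; a sketch of
 * the usual pattern (get_task_comm(), defined later in this header, wraps the
 * same thing):
 *
 *	char name[TASK_COMM_LEN];
 *
 *	strscpy_pad(name, p->comm);
 *	pr_info("task %d is '%s'\n", p->pid, name);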
*/ unsigned long blocker; #endif #ifdef CONFIG_DEBUG_ATOMIC_SLEEP int non_block_count; #endif #ifdef CONFIG_TRACE_IRQFLAGS struct irqtrace_events irqtrace; unsigned int hardirq_threaded; u64 hardirq_chain_key; int softirqs_enabled; int softirq_context; int irq_config; #endif #ifdef CONFIG_PREEMPT_RT int softirq_disable_cnt; #endif #ifdef CONFIG_LOCKDEP # define MAX_LOCK_DEPTH 48UL u64 curr_chain_key; int lockdep_depth; unsigned int lockdep_recursion; struct held_lock held_locks[MAX_LOCK_DEPTH]; #endif #if defined(CONFIG_UBSAN) && !defined(CONFIG_UBSAN_TRAP) unsigned int in_ubsan; #endif /* Journalling filesystem info: */ void *journal_info; /* Stacked block device info: */ struct bio_list *bio_list; /* Stack plugging: */ struct blk_plug *plug; /* VM state: */ struct reclaim_state *reclaim_state; struct io_context *io_context; #ifdef CONFIG_COMPACTION struct capture_control *capture_control; #endif /* Ptrace state: */ unsigned long ptrace_message; kernel_siginfo_t *last_siginfo; struct task_io_accounting ioac; #ifdef CONFIG_PSI /* Pressure stall state */ unsigned int psi_flags; #endif #ifdef CONFIG_TASK_XACCT /* Accumulated RSS usage: */ u64 acct_rss_mem1; /* Accumulated virtual memory usage: */ u64 acct_vm_mem1; /* stime + utime since last update: */ u64 acct_timexpd; #endif #ifdef CONFIG_CPUSETS /* Protected by ->alloc_lock: */ nodemask_t mems_allowed; /* Sequence number to catch updates: */ seqcount_spinlock_t mems_allowed_seq; int cpuset_mem_spread_rotor; #endif #ifdef CONFIG_CGROUPS /* Control Group info protected by css_set_lock: */ struct css_set __rcu *cgroups; /* cg_list protected by css_set_lock and tsk->alloc_lock: */ struct list_head cg_list; #endif #ifdef CONFIG_X86_CPU_RESCTRL u32 closid; u32 rmid; #endif #ifdef CONFIG_FUTEX struct robust_list_head __user *robust_list; #ifdef CONFIG_COMPAT struct compat_robust_list_head __user *compat_robust_list; #endif struct list_head pi_state_list; struct futex_pi_state *pi_state_cache; struct mutex futex_exit_mutex; unsigned int futex_state; #endif #ifdef CONFIG_PERF_EVENTS u8 perf_recursion[PERF_NR_CONTEXTS]; struct perf_event_context *perf_event_ctxp; struct mutex perf_event_mutex; struct list_head perf_event_list; struct perf_ctx_data __rcu *perf_ctx_data; #endif #ifdef CONFIG_DEBUG_PREEMPT unsigned long preempt_disable_ip; #endif #ifdef CONFIG_NUMA /* Protected by alloc_lock: */ struct mempolicy *mempolicy; short il_prev; u8 il_weight; short pref_node_fork; #endif #ifdef CONFIG_NUMA_BALANCING int numa_scan_seq; unsigned int numa_scan_period; unsigned int numa_scan_period_max; int numa_preferred_nid; unsigned long numa_migrate_retry; /* Migration stamp: */ u64 node_stamp; u64 last_task_numa_placement; u64 last_sum_exec_runtime; struct callback_head numa_work; /* * This pointer is only modified for current in syscall and * pagefault context (and for tasks being destroyed), so it can be read * from any of the following contexts: * - RCU read-side critical section * - current->numa_group from everywhere * - task's runqueue locked, task not running */ struct numa_group __rcu *numa_group; /* * numa_faults is an array split into four regions: * faults_memory, faults_cpu, faults_memory_buffer, faults_cpu_buffer * in this precise order. * * faults_memory: Exponential decaying average of faults on a per-node * basis. Scheduling placement decisions are made based on these * counts. The values remain static for the duration of a PTE scan. * faults_cpu: Track the nodes the process was running on when a NUMA * hinting fault was incurred. 
* faults_memory_buffer and faults_cpu_buffer: Record faults per node * during the current scan window. When the scan completes, the counts * in faults_memory and faults_cpu decay and these values are copied. */ unsigned long *numa_faults; unsigned long total_numa_faults; /* * numa_faults_locality tracks if faults recorded during the last * scan window were remote/local or failed to migrate. The task scan * period is adapted based on the locality of the faults with different * weights depending on whether they were shared or private faults */ unsigned long numa_faults_locality[3]; unsigned long numa_pages_migrated; #endif /* CONFIG_NUMA_BALANCING */ #ifdef CONFIG_RSEQ struct rseq __user *rseq; u32 rseq_len; u32 rseq_sig; /* * RmW on rseq_event_mask must be performed atomically * with respect to preemption. */ unsigned long rseq_event_mask; # ifdef CONFIG_DEBUG_RSEQ /* * This is a place holder to save a copy of the rseq fields for * validation of read-only fields. The struct rseq has a * variable-length array at the end, so it cannot be used * directly. Reserve a size large enough for the known fields. */ char rseq_fields[sizeof(struct rseq)]; # endif #endif #ifdef CONFIG_SCHED_MM_CID int mm_cid; /* Current cid in mm */ int last_mm_cid; /* Most recent cid in mm */ int migrate_from_cpu; int mm_cid_active; /* Whether cid bitmap is active */ struct callback_head cid_work; #endif struct tlbflush_unmap_batch tlb_ubc; /* Cache last used pipe for splice(): */ struct pipe_inode_info *splice_pipe; struct page_frag task_frag; #ifdef CONFIG_TASK_DELAY_ACCT struct task_delay_info *delays; #endif #ifdef CONFIG_FAULT_INJECTION int make_it_fail; unsigned int fail_nth; #endif /* * When (nr_dirtied >= nr_dirtied_pause), it's time to call * balance_dirty_pages() for a dirty throttling pause: */ int nr_dirtied; int nr_dirtied_pause; /* Start of a write-and-pause period: */ unsigned long dirty_paused_when; #ifdef CONFIG_LATENCYTOP int latency_record_count; struct latency_record latency_record[LT_SAVECOUNT]; #endif /* * Time slack values; these are used to round up poll() and * select() etc timeout values. These are in nanoseconds. */ u64 timer_slack_ns; u64 default_timer_slack_ns; #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) unsigned int kasan_depth; #endif #ifdef CONFIG_KCSAN struct kcsan_ctx kcsan_ctx; #ifdef CONFIG_TRACE_IRQFLAGS struct irqtrace_events kcsan_save_irqtrace; #endif #ifdef CONFIG_KCSAN_WEAK_MEMORY int kcsan_stack_depth; #endif #endif #ifdef CONFIG_KMSAN struct kmsan_ctx kmsan_ctx; #endif #if IS_ENABLED(CONFIG_KUNIT) struct kunit *kunit_test; #endif #ifdef CONFIG_FUNCTION_GRAPH_TRACER /* Index of current stored address in ret_stack: */ int curr_ret_stack; int curr_ret_depth; /* Stack of return addresses for return function tracing: */ unsigned long *ret_stack; /* Timestamp for last schedule: */ unsigned long long ftrace_timestamp; unsigned long long ftrace_sleeptime; /* * Number of functions that haven't been traced * because of depth overrun: */ atomic_t trace_overrun; /* Pause tracing: */ atomic_t tracing_graph_pause; #endif #ifdef CONFIG_TRACING /* Bitmask and counter of trace recursion: */ unsigned long trace_recursion; #endif /* CONFIG_TRACING */ #ifdef CONFIG_KCOV /* See kernel/kcov.c for more details. 
*/ /* Coverage collection mode enabled for this task (0 if disabled): */ unsigned int kcov_mode; /* Size of the kcov_area: */ unsigned int kcov_size; /* Buffer for coverage collection: */ void *kcov_area; /* KCOV descriptor wired with this task or NULL: */ struct kcov *kcov; /* KCOV common handle for remote coverage collection: */ u64 kcov_handle; /* KCOV sequence number: */ int kcov_sequence; /* Collect coverage from softirq context: */ unsigned int kcov_softirq; #endif #ifdef CONFIG_MEMCG_V1 struct mem_cgroup *memcg_in_oom; #endif #ifdef CONFIG_MEMCG /* Number of pages to reclaim on returning to userland: */ unsigned int memcg_nr_pages_over_high; /* Used by memcontrol for targeted memcg charge: */ struct mem_cgroup *active_memcg; /* Cache for current->cgroups->memcg->objcg lookups: */ struct obj_cgroup *objcg; #endif #ifdef CONFIG_BLK_CGROUP struct gendisk *throttle_disk; #endif #ifdef CONFIG_UPROBES struct uprobe_task *utask; #endif #if defined(CONFIG_BCACHE) || defined(CONFIG_BCACHE_MODULE) unsigned int sequential_io; unsigned int sequential_io_avg; #endif struct kmap_ctrl kmap_ctrl; #ifdef CONFIG_DEBUG_ATOMIC_SLEEP unsigned long task_state_change; # ifdef CONFIG_PREEMPT_RT unsigned long saved_state_change; # endif #endif struct rcu_head rcu; refcount_t rcu_users; int pagefault_disabled; #ifdef CONFIG_MMU struct task_struct *oom_reaper_list; struct timer_list oom_reaper_timer; #endif #ifdef CONFIG_VMAP_STACK struct vm_struct *stack_vm_area; #endif #ifdef CONFIG_THREAD_INFO_IN_TASK /* A live task holds one reference: */ refcount_t stack_refcount; #endif #ifdef CONFIG_LIVEPATCH int patch_state; #endif #ifdef CONFIG_SECURITY /* Used by LSM modules for access restriction: */ void *security; #endif #ifdef CONFIG_BPF_SYSCALL /* Used by BPF task local storage */ struct bpf_local_storage __rcu *bpf_storage; /* Used for BPF run context */ struct bpf_run_ctx *bpf_ctx; #endif /* Used by BPF for per-TASK xdp storage */ struct bpf_net_context *bpf_net_context; #ifdef CONFIG_KSTACK_ERASE unsigned long lowest_stack; #endif #ifdef CONFIG_KSTACK_ERASE_METRICS unsigned long prev_lowest_stack; #endif #ifdef CONFIG_X86_MCE void __user *mce_vaddr; __u64 mce_kflags; u64 mce_addr; __u64 mce_ripv : 1, mce_whole_page : 1, __mce_reserved : 62; struct callback_head mce_kill_me; int mce_count; #endif #ifdef CONFIG_KRETPROBES struct llist_head kretprobe_instances; #endif #ifdef CONFIG_RETHOOK struct llist_head rethooks; #endif #ifdef CONFIG_ARCH_HAS_PARANOID_L1D_FLUSH /* * If L1D flush is supported on mm context switch * then we use this callback head to queue kill work * to kill tasks that are not running on SMT disabled * cores */ struct callback_head l1d_flush_kill; #endif #ifdef CONFIG_RV /* * Per-task RV monitor, fixed in CONFIG_RV_PER_TASK_MONITORS. * If memory becomes a concern, we can think about a dynamic method. */ union rv_task_monitor rv[CONFIG_RV_PER_TASK_MONITORS]; #endif #ifdef CONFIG_USER_EVENTS struct user_event_mm *user_event_mm; #endif #ifdef CONFIG_UNWIND_USER struct unwind_task_info unwind_info; #endif /* CPU-specific state of this task: */ struct thread_struct thread; /* * New fields for task_struct should be added above here, so that * they are included in the randomized portion of task_struct. 
*/ randomized_struct_fields_end } __attribute__ ((aligned (64))); #ifdef CONFIG_SCHED_PROXY_EXEC DECLARE_STATIC_KEY_TRUE(__sched_proxy_exec); static inline bool sched_proxy_exec(void) { return static_branch_likely(&__sched_proxy_exec); } #else static inline bool sched_proxy_exec(void) { return false; } #endif #define TASK_REPORT_IDLE (TASK_REPORT + 1) #define TASK_REPORT_MAX (TASK_REPORT_IDLE << 1) static inline unsigned int __task_state_index(unsigned int tsk_state, unsigned int tsk_exit_state) { unsigned int state = (tsk_state | tsk_exit_state) & TASK_REPORT; BUILD_BUG_ON_NOT_POWER_OF_2(TASK_REPORT_MAX); if ((tsk_state & TASK_IDLE) == TASK_IDLE) state = TASK_REPORT_IDLE; /* * We're lying here, but rather than expose a completely new task state * to userspace, we can make this appear as if the task has gone through * a regular rt_mutex_lock() call. * Report frozen tasks as uninterruptible. */ if ((tsk_state & TASK_RTLOCK_WAIT) || (tsk_state & TASK_FROZEN)) state = TASK_UNINTERRUPTIBLE; return fls(state); } static inline unsigned int task_state_index(struct task_struct *tsk) { return __task_state_index(READ_ONCE(tsk->__state), tsk->exit_state); } static inline char task_index_to_char(unsigned int state) { static const char state_char[] = "RSDTtXZPI"; BUILD_BUG_ON(TASK_REPORT_MAX * 2 != 1 << (sizeof(state_char) - 1)); return state_char[state]; } static inline char task_state_to_char(struct task_struct *tsk) { return task_index_to_char(task_state_index(tsk)); } extern struct pid *cad_pid; /* * Per process flags */ #define PF_VCPU 0x00000001 /* I'm a virtual CPU */ #define PF_IDLE 0x00000002 /* I am an IDLE thread */ #define PF_EXITING 0x00000004 /* Getting shut down */ #define PF_POSTCOREDUMP 0x00000008 /* Coredumps should ignore this task */ #define PF_IO_WORKER 0x00000010 /* Task is an IO worker */ #define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */ #define PF_FORKNOEXEC 0x00000040 /* Forked but didn't exec */ #define PF_MCE_PROCESS 0x00000080 /* Process policy on mce errors */ #define PF_SUPERPRIV 0x00000100 /* Used super-user privileges */ #define PF_DUMPCORE 0x00000200 /* Dumped core */ #define PF_SIGNALED 0x00000400 /* Killed by a signal */ #define PF_MEMALLOC 0x00000800 /* Allocating memory to free memory. See memalloc_noreclaim_save() */ #define PF_NPROC_EXCEEDED 0x00001000 /* set_user() noticed that RLIMIT_NPROC was exceeded */ #define PF_USED_MATH 0x00002000 /* If unset the fpu must be initialized before use */ #define PF_USER_WORKER 0x00004000 /* Kernel thread cloned from userspace thread */ #define PF_NOFREEZE 0x00008000 /* This thread should not be frozen */ #define PF_KCOMPACTD 0x00010000 /* I am kcompactd */ #define PF_KSWAPD 0x00020000 /* I am kswapd */ #define PF_MEMALLOC_NOFS 0x00040000 /* All allocations inherit GFP_NOFS. See memalloc_nfs_save() */ #define PF_MEMALLOC_NOIO 0x00080000 /* All allocations inherit GFP_NOIO. See memalloc_noio_save() */ #define PF_LOCAL_THROTTLE 0x00100000 /* Throttle writes only against the bdi I write to, * I am cleaning dirty pages from some other bdi. 
*/ #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ #define PF_RANDOMIZE 0x00400000 /* Randomize virtual address space */ #define PF__HOLE__00800000 0x00800000 #define PF__HOLE__01000000 0x01000000 #define PF__HOLE__02000000 0x02000000 #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */ #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ #define PF_MEMALLOC_PIN 0x10000000 /* Allocations constrained to zones which allow long term pinning. * See memalloc_pin_save() */ #define PF_BLOCK_TS 0x20000000 /* plug has ts that needs updating */ #define PF__HOLE__40000000 0x40000000 #define PF_SUSPEND_TASK 0x80000000 /* This thread called freeze_processes() and should not be frozen */ /* * Only the _current_ task can read/write to tsk->flags, but other * tasks can access tsk->flags in readonly mode for example * with tsk_used_math (like during threaded core dumping). * There is however an exception to this rule during ptrace * or during fork: the ptracer task is allowed to write to the * child->flags of its traced child (same goes for fork, the parent * can write to the child->flags), because we're guaranteed the * child is not running and in turn not changing child->flags * at the same time the parent does it. */ #define clear_stopped_child_used_math(child) do { (child)->flags &= ~PF_USED_MATH; } while (0) #define set_stopped_child_used_math(child) do { (child)->flags |= PF_USED_MATH; } while (0) #define clear_used_math() clear_stopped_child_used_math(current) #define set_used_math() set_stopped_child_used_math(current) #define conditional_stopped_child_used_math(condition, child) \ do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= (condition) ? PF_USED_MATH : 0; } while (0) #define conditional_used_math(condition) conditional_stopped_child_used_math(condition, current) #define copy_to_stopped_child_used_math(child) \ do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= current->flags & PF_USED_MATH; } while (0) /* NOTE: this will return 0 or PF_USED_MATH, it will never return 1 */ #define tsk_used_math(p) ((p)->flags & PF_USED_MATH) #define used_math() tsk_used_math(current) static __always_inline bool is_percpu_thread(void) { return (current->flags & PF_NO_SETAFFINITY) && (current->nr_cpus_allowed == 1); } /* Per-process atomic flags. */ #define PFA_NO_NEW_PRIVS 0 /* May not gain new privileges. 
*/ #define PFA_SPREAD_PAGE 1 /* Spread page cache over cpuset */ #define PFA_SPREAD_SLAB 2 /* Spread some slab caches over cpuset */ #define PFA_SPEC_SSB_DISABLE 3 /* Speculative Store Bypass disabled */ #define PFA_SPEC_SSB_FORCE_DISABLE 4 /* Speculative Store Bypass force disabled*/ #define PFA_SPEC_IB_DISABLE 5 /* Indirect branch speculation restricted */ #define PFA_SPEC_IB_FORCE_DISABLE 6 /* Indirect branch speculation permanently restricted */ #define PFA_SPEC_SSB_NOEXEC 7 /* Speculative Store Bypass clear on execve() */ #define TASK_PFA_TEST(name, func) \ static inline bool task_##func(struct task_struct *p) \ { return test_bit(PFA_##name, &p->atomic_flags); } #define TASK_PFA_SET(name, func) \ static inline void task_set_##func(struct task_struct *p) \ { set_bit(PFA_##name, &p->atomic_flags); } #define TASK_PFA_CLEAR(name, func) \ static inline void task_clear_##func(struct task_struct *p) \ { clear_bit(PFA_##name, &p->atomic_flags); } TASK_PFA_TEST(NO_NEW_PRIVS, no_new_privs) TASK_PFA_SET(NO_NEW_PRIVS, no_new_privs) TASK_PFA_TEST(SPREAD_PAGE, spread_page) TASK_PFA_SET(SPREAD_PAGE, spread_page) TASK_PFA_CLEAR(SPREAD_PAGE, spread_page) TASK_PFA_TEST(SPREAD_SLAB, spread_slab) TASK_PFA_SET(SPREAD_SLAB, spread_slab) TASK_PFA_CLEAR(SPREAD_SLAB, spread_slab) TASK_PFA_TEST(SPEC_SSB_DISABLE, spec_ssb_disable) TASK_PFA_SET(SPEC_SSB_DISABLE, spec_ssb_disable) TASK_PFA_CLEAR(SPEC_SSB_DISABLE, spec_ssb_disable) TASK_PFA_TEST(SPEC_SSB_NOEXEC, spec_ssb_noexec) TASK_PFA_SET(SPEC_SSB_NOEXEC, spec_ssb_noexec) TASK_PFA_CLEAR(SPEC_SSB_NOEXEC, spec_ssb_noexec) TASK_PFA_TEST(SPEC_SSB_FORCE_DISABLE, spec_ssb_force_disable) TASK_PFA_SET(SPEC_SSB_FORCE_DISABLE, spec_ssb_force_disable) TASK_PFA_TEST(SPEC_IB_DISABLE, spec_ib_disable) TASK_PFA_SET(SPEC_IB_DISABLE, spec_ib_disable) TASK_PFA_CLEAR(SPEC_IB_DISABLE, spec_ib_disable) TASK_PFA_TEST(SPEC_IB_FORCE_DISABLE, spec_ib_force_disable) TASK_PFA_SET(SPEC_IB_FORCE_DISABLE, spec_ib_force_disable) static inline void current_restore_flags(unsigned long orig_flags, unsigned long flags) { current->flags &= ~flags; current->flags |= orig_flags & flags; } extern int cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial); extern int task_can_attach(struct task_struct *p); extern int dl_bw_alloc(int cpu, u64 dl_bw); extern void dl_bw_free(int cpu, u64 dl_bw); /* do_set_cpus_allowed() - consider using set_cpus_allowed_ptr() instead */ extern void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask); /** * set_cpus_allowed_ptr - set CPU affinity mask of a task * @p: the task * @new_mask: CPU affinity mask * * Return: zero if successful, or a negative error code */ extern int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask); extern int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, int node); extern void release_user_cpus_ptr(struct task_struct *p); extern int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask); extern void force_compatible_cpus_allowed_ptr(struct task_struct *p); extern void relax_compatible_cpus_allowed_ptr(struct task_struct *p); extern int yield_to(struct task_struct *p, bool preempt); extern void set_user_nice(struct task_struct *p, long nice); extern int task_prio(const struct task_struct *p); /** * task_nice - return the nice value of a given task. * @p: the task in question. * * Return: The nice value [ -20 ... 0 ... 19 ]. 
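 *
 * For illustration, current_restore_flags() above is essentially the pattern
 * used by the memalloc_*_save()/restore() helpers; a minimal sketch of
 * scoping a PF_* flag around a section (the callee name is a stand-in):
 *
 *	unsigned long pflags = current->flags;
 *
 *	current->flags |= PF_MEMALLOC_NOIO;
 *	do_work_that_must_not_recurse_into_io();
 *	current_restore_flags(pflags, PF_MEMALLOC_NOIO);
 *
 * Real code should prefer memalloc_noio_save()/memalloc_noio_restore() from
 * <linux/sched/mm.h>, which wrap exactly this.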
*/ static inline int task_nice(const struct task_struct *p) { return PRIO_TO_NICE((p)->static_prio); } extern int can_nice(const struct task_struct *p, const int nice); extern int task_curr(const struct task_struct *p); extern int idle_cpu(int cpu); extern int available_idle_cpu(int cpu); extern int sched_setscheduler(struct task_struct *, int, const struct sched_param *); extern int sched_setscheduler_nocheck(struct task_struct *, int, const struct sched_param *); extern void sched_set_fifo(struct task_struct *p); extern void sched_set_fifo_low(struct task_struct *p); extern void sched_set_normal(struct task_struct *p, int nice); extern int sched_setattr(struct task_struct *, const struct sched_attr *); extern int sched_setattr_nocheck(struct task_struct *, const struct sched_attr *); extern struct task_struct *idle_task(int cpu); /** * is_idle_task - is the specified task an idle task? * @p: the task in question. * * Return: 1 if @p is an idle task. 0 otherwise. */ static __always_inline bool is_idle_task(const struct task_struct *p) { return !!(p->flags & PF_IDLE); } extern struct task_struct *curr_task(int cpu); extern void ia64_set_curr_task(int cpu, struct task_struct *p); void yield(void); union thread_union { struct task_struct task; #ifndef CONFIG_THREAD_INFO_IN_TASK struct thread_info thread_info; #endif unsigned long stack[THREAD_SIZE/sizeof(long)]; }; #ifndef CONFIG_THREAD_INFO_IN_TASK extern struct thread_info init_thread_info; #endif extern unsigned long init_stack[THREAD_SIZE / sizeof(unsigned long)]; #ifdef CONFIG_THREAD_INFO_IN_TASK # define task_thread_info(task) (&(task)->thread_info) #else # define task_thread_info(task) ((struct thread_info *)(task)->stack) #endif /* * find a task by one of its numerical ids * * find_task_by_pid_ns(): * finds a task by its pid in the specified namespace * find_task_by_vpid(): * finds a task by its virtual pid * * see also find_vpid() etc in include/linux/pid.h */ extern struct task_struct *find_task_by_vpid(pid_t nr); extern struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns); /* * find a task by its virtual pid and get the task struct */ extern struct task_struct *find_get_task_by_vpid(pid_t nr); extern int wake_up_state(struct task_struct *tsk, unsigned int state); extern int wake_up_process(struct task_struct *tsk); extern void wake_up_new_task(struct task_struct *tsk); extern void kick_process(struct task_struct *tsk); extern void __set_task_comm(struct task_struct *tsk, const char *from, bool exec); #define set_task_comm(tsk, from) ({ \ BUILD_BUG_ON(sizeof(from) != TASK_COMM_LEN); \ __set_task_comm(tsk, from, false); \ }) /* * - Why not use task_lock()? * User space can randomly change their names anyway, so locking for readers * doesn't make sense. For writers, locking is probably necessary, as a race * condition could lead to long-term mixed results. * The strscpy_pad() in __set_task_comm() can ensure that the task comm is * always NUL-terminated and zero-padded. Therefore the race condition between * reader and writer is not an issue. * * - BUILD_BUG_ON() can help prevent the buf from being truncated. * Since the callers don't perform any return value checks, this safeguard is * necessary. */ #define get_task_comm(buf, tsk) ({ \ BUILD_BUG_ON(sizeof(buf) < TASK_COMM_LEN); \ strscpy_pad(buf, (tsk)->comm); \ buf; \ }) static __always_inline void scheduler_ipi(void) { /* * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting * TIF_NEED_RESCHED remotely (for the first time) will also send * this IPI. 
*/ preempt_fold_need_resched(); } extern unsigned long wait_task_inactive(struct task_struct *, unsigned int match_state); /* * Set thread flags in other task's structures. * See asm/thread_info.h for TIF_xxxx flags available: */ static inline void set_tsk_thread_flag(struct task_struct *tsk, int flag) { set_ti_thread_flag(task_thread_info(tsk), flag); } static inline void clear_tsk_thread_flag(struct task_struct *tsk, int flag) { clear_ti_thread_flag(task_thread_info(tsk), flag); } static inline void update_tsk_thread_flag(struct task_struct *tsk, int flag, bool value) { update_ti_thread_flag(task_thread_info(tsk), flag, value); } static inline int test_and_set_tsk_thread_flag(struct task_struct *tsk, int flag) { return test_and_set_ti_thread_flag(task_thread_info(tsk), flag); } static inline int test_and_clear_tsk_thread_flag(struct task_struct *tsk, int flag) { return test_and_clear_ti_thread_flag(task_thread_info(tsk), flag); } static inline int test_tsk_thread_flag(struct task_struct *tsk, int flag) { return test_ti_thread_flag(task_thread_info(tsk), flag); } static inline void set_tsk_need_resched(struct task_struct *tsk) { if (tracepoint_enabled(sched_set_need_resched_tp) && !test_tsk_thread_flag(tsk, TIF_NEED_RESCHED)) __trace_set_need_resched(tsk, TIF_NEED_RESCHED); set_tsk_thread_flag(tsk,TIF_NEED_RESCHED); } static inline void clear_tsk_need_resched(struct task_struct *tsk) { atomic_long_andnot(_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY, (atomic_long_t *)&task_thread_info(tsk)->flags); } static inline int test_tsk_need_resched(struct task_struct *tsk) { return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED)); } /* * cond_resched() and cond_resched_lock(): latency reduction via * explicit rescheduling in places that are safe. The return * value indicates whether a reschedule was done in fact. * cond_resched_lock() will drop the spinlock before scheduling, */ #if !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC) extern int __cond_resched(void); #if defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) DECLARE_STATIC_CALL(cond_resched, __cond_resched); static __always_inline int _cond_resched(void) { return static_call_mod(cond_resched)(); } #elif defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) extern int dynamic_cond_resched(void); static __always_inline int _cond_resched(void) { return dynamic_cond_resched(); } #else /* !CONFIG_PREEMPTION */ static inline int _cond_resched(void) { return __cond_resched(); } #endif /* PREEMPT_DYNAMIC && CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */ #else /* CONFIG_PREEMPTION && !CONFIG_PREEMPT_DYNAMIC */ static inline int _cond_resched(void) { return 0; } #endif /* !CONFIG_PREEMPTION || CONFIG_PREEMPT_DYNAMIC */ #define cond_resched() ({ \ __might_resched(__FILE__, __LINE__, 0); \ _cond_resched(); \ }) extern int __cond_resched_lock(spinlock_t *lock); extern int __cond_resched_rwlock_read(rwlock_t *lock); extern int __cond_resched_rwlock_write(rwlock_t *lock); #define MIGHT_RESCHED_RCU_SHIFT 8 #define MIGHT_RESCHED_PREEMPT_MASK ((1U << MIGHT_RESCHED_RCU_SHIFT) - 1) #ifndef CONFIG_PREEMPT_RT /* * Non RT kernels have an elevated preempt count due to the held lock, * but are not allowed to be inside a RCU read side critical section */ # define PREEMPT_LOCK_RESCHED_OFFSETS PREEMPT_LOCK_OFFSET #else /* * spin/rw_lock() on RT implies rcu_read_lock(). The might_sleep() check in * cond_resched*lock() has to take that into account because it checks for * preempt_count() and rcu_preempt_depth(). 
*/ # define PREEMPT_LOCK_RESCHED_OFFSETS \ (PREEMPT_LOCK_OFFSET + (1U << MIGHT_RESCHED_RCU_SHIFT)) #endif #define cond_resched_lock(lock) ({ \ __might_resched(__FILE__, __LINE__, PREEMPT_LOCK_RESCHED_OFFSETS); \ __cond_resched_lock(lock); \ }) #define cond_resched_rwlock_read(lock) ({ \ __might_resched(__FILE__, __LINE__, PREEMPT_LOCK_RESCHED_OFFSETS); \ __cond_resched_rwlock_read(lock); \ }) #define cond_resched_rwlock_write(lock) ({ \ __might_resched(__FILE__, __LINE__, PREEMPT_LOCK_RESCHED_OFFSETS); \ __cond_resched_rwlock_write(lock); \ }) #ifndef CONFIG_PREEMPT_RT static inline struct mutex *__get_task_blocked_on(struct task_struct *p) { struct mutex *m = p->blocked_on; if (m) lockdep_assert_held_once(&m->wait_lock); return m; } static inline void __set_task_blocked_on(struct task_struct *p, struct mutex *m) { struct mutex *blocked_on = READ_ONCE(p->blocked_on); WARN_ON_ONCE(!m); /* The task should only be setting itself as blocked */ WARN_ON_ONCE(p != current); /* Currently we serialize blocked_on under the mutex::wait_lock */ lockdep_assert_held_once(&m->wait_lock); /* * Check ensure we don't overwrite existing mutex value * with a different mutex. Note, setting it to the same * lock repeatedly is ok. */ WARN_ON_ONCE(blocked_on && blocked_on != m); WRITE_ONCE(p->blocked_on, m); } static inline void set_task_blocked_on(struct task_struct *p, struct mutex *m) { guard(raw_spinlock_irqsave)(&m->wait_lock); __set_task_blocked_on(p, m); } static inline void __clear_task_blocked_on(struct task_struct *p, struct mutex *m) { if (m) { struct mutex *blocked_on = READ_ONCE(p->blocked_on); /* Currently we serialize blocked_on under the mutex::wait_lock */ lockdep_assert_held_once(&m->wait_lock); /* * There may be cases where we re-clear already cleared * blocked_on relationships, but make sure we are not * clearing the relationship with a different lock. */ WARN_ON_ONCE(blocked_on && blocked_on != m); } WRITE_ONCE(p->blocked_on, NULL); } static inline void clear_task_blocked_on(struct task_struct *p, struct mutex *m) { guard(raw_spinlock_irqsave)(&m->wait_lock); __clear_task_blocked_on(p, m); } #else static inline void __clear_task_blocked_on(struct task_struct *p, struct rt_mutex *m) { } static inline void clear_task_blocked_on(struct task_struct *p, struct rt_mutex *m) { } #endif /* !CONFIG_PREEMPT_RT */ static __always_inline bool need_resched(void) { return unlikely(tif_need_resched()); } /* * Wrappers for p->thread_info->cpu access. No-op on UP. */ #ifdef CONFIG_SMP static inline unsigned int task_cpu(const struct task_struct *p) { return READ_ONCE(task_thread_info(p)->cpu); } extern void set_task_cpu(struct task_struct *p, unsigned int cpu); #else static inline unsigned int task_cpu(const struct task_struct *p) { return 0; } static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) { } #endif /* CONFIG_SMP */ static inline bool task_is_runnable(struct task_struct *p) { return p->on_rq && !p->se.sched_delayed; } extern bool sched_task_on_rq(struct task_struct *p); extern unsigned long get_wchan(struct task_struct *p); extern struct task_struct *cpu_curr_snapshot(int cpu); /* * In order to reduce various lock holder preemption latencies provide an * interface to see if a vCPU is currently running or not. * * This allows us to terminate optimistic spin loops and block, analogous to * the native optimistic spin heuristic of testing if the lock owner task is * running or not. 
*/ #ifndef vcpu_is_preempted static inline bool vcpu_is_preempted(int cpu) { return false; } #endif extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask); extern long sched_getaffinity(pid_t pid, struct cpumask *mask); #ifndef TASK_SIZE_OF #define TASK_SIZE_OF(tsk) TASK_SIZE #endif static inline bool owner_on_cpu(struct task_struct *owner) { /* * As lock holder preemption issue, we both skip spinning if * task is not on cpu or its cpu is preempted */ return READ_ONCE(owner->on_cpu) && !vcpu_is_preempted(task_cpu(owner)); } /* Returns effective CPU energy utilization, as seen by the scheduler */ unsigned long sched_cpu_util(int cpu); #ifdef CONFIG_SCHED_CORE extern void sched_core_free(struct task_struct *tsk); extern void sched_core_fork(struct task_struct *p); extern int sched_core_share_pid(unsigned int cmd, pid_t pid, enum pid_type type, unsigned long uaddr); extern int sched_core_idle_cpu(int cpu); #else static inline void sched_core_free(struct task_struct *tsk) { } static inline void sched_core_fork(struct task_struct *p) { } static inline int sched_core_idle_cpu(int cpu) { return idle_cpu(cpu); } #endif extern void sched_set_stop_task(int cpu, struct task_struct *stop); #ifdef CONFIG_MEM_ALLOC_PROFILING static __always_inline struct alloc_tag *alloc_tag_save(struct alloc_tag *tag) { swap(current->alloc_tag, tag); return tag; } static __always_inline void alloc_tag_restore(struct alloc_tag *tag, struct alloc_tag *old) { #ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG WARN(current->alloc_tag != tag, "current->alloc_tag was changed:\n"); #endif current->alloc_tag = old; } #else #define alloc_tag_save(_tag) NULL #define alloc_tag_restore(_tag, _old) do {} while (0) #endif #ifndef MODULE #ifndef COMPILE_OFFSETS extern void ___migrate_enable(void); struct rq; DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); /* * The "struct rq" is not available here, so we can't access the * "runqueues" with this_cpu_ptr(), as the compilation will fail in * this_cpu_ptr() -> raw_cpu_ptr() -> __verify_pcpu_ptr(): * typeof((ptr) + 0) * * So use arch_raw_cpu_ptr()/PERCPU_PTR() directly here. */ #ifdef CONFIG_SMP #define this_rq_raw() arch_raw_cpu_ptr(&runqueues) #else #define this_rq_raw() PERCPU_PTR(&runqueues) #endif #define this_rq_pinned() (*(unsigned int *)((void *)this_rq_raw() + RQ_nr_pinned)) static inline void __migrate_enable(void) { struct task_struct *p = current; #ifdef CONFIG_DEBUG_PREEMPT /* * Check both overflow from migrate_disable() and superfluous * migrate_enable(). */ if (WARN_ON_ONCE((s16)p->migration_disabled <= 0)) return; #endif if (p->migration_disabled > 1) { p->migration_disabled--; return; } /* * Ensure stop_task runs either before or after this, and that * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule(). */ guard(preempt)(); if (unlikely(p->cpus_ptr != &p->cpus_mask)) ___migrate_enable(); /* * Mustn't clear migration_disabled() until cpus_ptr points back at the * regular cpus_mask, otherwise things that race (eg. * select_fallback_rq) get confused. */ barrier(); p->migration_disabled = 0; this_rq_pinned()--; } static inline void __migrate_disable(void) { struct task_struct *p = current; if (p->migration_disabled) { #ifdef CONFIG_DEBUG_PREEMPT /* *Warn about overflow half-way through the range. 
*/ WARN_ON_ONCE((s16)p->migration_disabled < 0); #endif p->migration_disabled++; return; } guard(preempt)(); this_rq_pinned()++; p->migration_disabled = 1; } #else /* !COMPILE_OFFSETS */ static inline void __migrate_disable(void) { } static inline void __migrate_enable(void) { } #endif /* !COMPILE_OFFSETS */ /* * So that it is possible to not export the runqueues variable, define and * export migrate_enable/migrate_disable in kernel/sched/core.c too, and use * them for the modules. The macro "INSTANTIATE_EXPORTED_MIGRATE_DISABLE" will * be defined in kernel/sched/core.c. */ #ifndef INSTANTIATE_EXPORTED_MIGRATE_DISABLE static inline void migrate_disable(void) { __migrate_disable(); } static inline void migrate_enable(void) { __migrate_enable(); } #else /* INSTANTIATE_EXPORTED_MIGRATE_DISABLE */ extern void migrate_disable(void); extern void migrate_enable(void); #endif /* INSTANTIATE_EXPORTED_MIGRATE_DISABLE */ #else /* MODULE */ extern void migrate_disable(void); extern void migrate_enable(void); #endif /* MODULE */ DEFINE_LOCK_GUARD_0(migrate, migrate_disable(), migrate_enable()) #endif
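/*
 * Editor's note: a minimal, illustrative sketch, not part of the
 * <linux/sched.h> excerpt above. It shows how a few of the APIs declared
 * there (set_cpus_allowed_ptr(), set_user_nice(), cond_resched() and the
 * migrate_disable()/migrate_enable() pair) are typically combined in a
 * simple kernel-thread module. All demo_* names are hypothetical.
 */
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/cpumask.h>
#include <linux/delay.h>
#include <linux/err.h>

static struct task_struct *demo_task;

static int demo_thread_fn(void *data)
{
	while (!kthread_should_stop()) {
		/* Briefly forbid CPU migration around per-CPU work. */
		migrate_disable();
		/* ... per-CPU work would go here ... */
		migrate_enable();

		/* Long-running loops should offer to reschedule. */
		cond_resched();
		msleep(100);
	}
	return 0;
}

static int __init demo_init(void)
{
	demo_task = kthread_run(demo_thread_fn, NULL, "demo_worker");
	if (IS_ERR(demo_task))
		return PTR_ERR(demo_task);

	/* Pin the worker to CPU 0 and lower its priority. */
	set_cpus_allowed_ptr(demo_task, cpumask_of(0));
	set_user_nice(demo_task, 10);
	return 0;
}

static void __exit demo_exit(void)
{
	kthread_stop(demo_task);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");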
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * xt_connmark - Netfilter module to operate on connection marks
 *
 * Copyright (C) 2002,2004 MARA Systems AB <https://www.marasystems.com>
 * by Henrik Nordstrom <hno@marasystems.com>
 * Copyright © CC Computer Consultants GmbH, 2007 - 2008
 * Jan Engelhardt <jengelh@medozas.de>
 */

#include <linux/module.h>
#include <linux/skbuff.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_ecache.h>
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter/xt_connmark.h>

MODULE_AUTHOR("Henrik Nordstrom <hno@marasystems.com>");
MODULE_DESCRIPTION("Xtables: connection mark operations");
MODULE_LICENSE("GPL");
MODULE_ALIAS("ipt_CONNMARK");
MODULE_ALIAS("ip6t_CONNMARK");
MODULE_ALIAS("ipt_connmark");
MODULE_ALIAS("ip6t_connmark");

static unsigned int
connmark_tg_shift(struct sk_buff *skb, const struct xt_connmark_tginfo2 *info)
{
	enum ip_conntrack_info ctinfo;
	u_int32_t new_targetmark;
	struct nf_conn *ct;
	u_int32_t newmark;
	u_int32_t oldmark;

	ct = nf_ct_get(skb, &ctinfo);
	if (ct == NULL)
		return XT_CONTINUE;

	switch (info->mode) {
	case XT_CONNMARK_SET:
		oldmark = READ_ONCE(ct->mark);
		newmark = (oldmark & ~info->ctmask) ^ info->ctmark;
		if (info->shift_dir == D_SHIFT_RIGHT)
			newmark >>= info->shift_bits;
		else
			newmark <<= info->shift_bits;

		if (READ_ONCE(ct->mark) != newmark) {
			WRITE_ONCE(ct->mark, newmark);
			nf_conntrack_event_cache(IPCT_MARK, ct);
		}
		break;
	case XT_CONNMARK_SAVE:
		new_targetmark = (skb->mark & info->nfmask);
		if (info->shift_dir == D_SHIFT_RIGHT)
			new_targetmark >>= info->shift_bits;
		else
			new_targetmark <<= info->shift_bits;

		newmark = (READ_ONCE(ct->mark) & ~info->ctmask) ^ new_targetmark;
		if (READ_ONCE(ct->mark) != newmark) {
			WRITE_ONCE(ct->mark, newmark);
			nf_conntrack_event_cache(IPCT_MARK, ct);
		}
		break;
	case XT_CONNMARK_RESTORE:
		new_targetmark = (READ_ONCE(ct->mark) & info->ctmask);
		if (info->shift_dir == D_SHIFT_RIGHT)
			new_targetmark >>= info->shift_bits;
		else
			new_targetmark <<= info->shift_bits;

		newmark = (skb->mark & ~info->nfmask) ^ new_targetmark;
		skb->mark = newmark;
		break;
	}
	return XT_CONTINUE;
}

static unsigned int
connmark_tg(struct sk_buff *skb, const struct xt_action_param *par)
{
	const struct xt_connmark_tginfo1 *info = par->targinfo;
	const struct xt_connmark_tginfo2 info2 = {
		.ctmark = info->ctmark,
		.ctmask = info->ctmask,
		.nfmask = info->nfmask,
		.mode   = info->mode,
	};

	return connmark_tg_shift(skb, &info2);
}

static unsigned int
connmark_tg_v2(struct sk_buff *skb, const struct xt_action_param *par)
{
	const struct xt_connmark_tginfo2 *info = par->targinfo;

	return connmark_tg_shift(skb, info);
}

static int connmark_tg_check(const struct xt_tgchk_param *par)
{
	int ret;

	ret = nf_ct_netns_get(par->net, par->family);
	if (ret < 0)
		pr_info_ratelimited("cannot load conntrack support for proto=%u\n",
				    par->family);
	return ret;
}

static void connmark_tg_destroy(const struct xt_tgdtor_param *par)
{
	nf_ct_netns_put(par->net, par->family);
}

static bool
connmark_mt(const struct sk_buff *skb, struct xt_action_param *par)
{
	const struct xt_connmark_mtinfo1 *info = par->matchinfo;
	enum ip_conntrack_info ctinfo;
	const struct nf_conn *ct;

	ct = nf_ct_get(skb, &ctinfo);
	if (ct == NULL)
		return false;

	return ((READ_ONCE(ct->mark) & info->mask) == info->mark) ^ info->invert;
}

static int connmark_mt_check(const struct xt_mtchk_param *par)
{
	int ret;

	ret = nf_ct_netns_get(par->net, par->family);
	if (ret < 0)
		pr_info_ratelimited("cannot load conntrack support for proto=%u\n",
				    par->family);
	return ret;
}

static void connmark_mt_destroy(const struct xt_mtdtor_param *par)
{
	nf_ct_netns_put(par->net, par->family);
}

static struct xt_target connmark_tg_reg[] __read_mostly = {
	{
		.name           = "CONNMARK",
		.revision       = 1,
		.family         = NFPROTO_IPV4,
		.checkentry     = connmark_tg_check,
		.target         = connmark_tg,
		.targetsize     = sizeof(struct xt_connmark_tginfo1),
		.destroy        = connmark_tg_destroy,
		.me             = THIS_MODULE,
	},
	{
		.name           = "CONNMARK",
		.revision       = 2,
		.family         = NFPROTO_IPV4,
		.checkentry     = connmark_tg_check,
		.target         = connmark_tg_v2,
		.targetsize     = sizeof(struct xt_connmark_tginfo2),
		.destroy        = connmark_tg_destroy,
		.me             = THIS_MODULE,
	},
#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
	{
		.name           = "CONNMARK",
		.revision       = 1,
		.family         = NFPROTO_IPV6,
		.checkentry     = connmark_tg_check,
		.target         = connmark_tg,
		.targetsize     = sizeof(struct xt_connmark_tginfo1),
		.destroy        = connmark_tg_destroy,
		.me             = THIS_MODULE,
	},
	{
		.name           = "CONNMARK",
		.revision       = 2,
		.family         = NFPROTO_IPV6,
		.checkentry     = connmark_tg_check,
		.target         = connmark_tg_v2,
		.targetsize     = sizeof(struct xt_connmark_tginfo2),
		.destroy        = connmark_tg_destroy,
		.me             = THIS_MODULE,
	},
#endif
};

static struct xt_match connmark_mt_reg __read_mostly = {
	.name           = "connmark",
	.revision       = 1,
	.family         = NFPROTO_UNSPEC,
	.checkentry     = connmark_mt_check,
	.match          = connmark_mt,
	.matchsize      = sizeof(struct xt_connmark_mtinfo1),
	.destroy        = connmark_mt_destroy,
	.me             = THIS_MODULE,
};

static int __init connmark_mt_init(void)
{
	int ret;

	ret = xt_register_targets(connmark_tg_reg, ARRAY_SIZE(connmark_tg_reg));
	if (ret < 0)
		return ret;
	ret = xt_register_match(&connmark_mt_reg);
	if (ret < 0) {
		xt_unregister_targets(connmark_tg_reg, ARRAY_SIZE(connmark_tg_reg));
		return ret;
	}
	return 0;
}

static void __exit connmark_mt_exit(void)
{
	xt_unregister_match(&connmark_mt_reg);
	xt_unregister_targets(connmark_tg_reg, ARRAY_SIZE(connmark_tg_reg));
}

module_init(connmark_mt_init);
module_exit(connmark_mt_exit);
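/*
 * Editor's note: a standalone user-space illustration, not part of the
 * module above, of the XT_CONNMARK_SAVE arithmetic in connmark_tg_shift().
 * The mask, shift and mark values are arbitrary examples.
 */
#include <stdint.h>
#include <stdio.h>

/* Mirrors the SAVE case: fold shifted skb->mark bits into ct->mark. */
static uint32_t connmark_save(uint32_t ct_mark, uint32_t skb_mark,
			      uint32_t ctmask, uint32_t nfmask,
			      unsigned int shift_bits, int shift_right)
{
	uint32_t new_targetmark = skb_mark & nfmask;

	if (shift_right)
		new_targetmark >>= shift_bits;
	else
		new_targetmark <<= shift_bits;

	/* Keep the ct bits outside ctmask, XOR the shifted skb bits in. */
	return (ct_mark & ~ctmask) ^ new_targetmark;
}

int main(void)
{
	uint32_t ct_mark = 0x11110000, skb_mark = 0x0000002a;

	/* Store skb mark byte 0 into ct mark byte 1: prints 0x11112a00. */
	printf("new ct->mark = 0x%08x\n",
	       connmark_save(ct_mark, skb_mark, 0x0000ff00, 0x000000ff, 8, 0));
	return 0;
}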
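/*
 * Editor's note: a minimal usage sketch, not part of the header that follows.
 * It combines a few primitives declared in the include/linux/cpumask.h
 * excerpt below (zalloc_cpumask_var(), cpumask_and(), for_each_cpu(),
 * free_cpumask_var()). The demo_* name is hypothetical.
 */
#include <linux/cpumask.h>
#include <linux/errno.h>
#include <linux/gfp.h>

static int demo_count_online_in(const struct cpumask *allowed)
{
	cpumask_var_t tmp;
	unsigned int cpu, n = 0;

	/* On-stack mask unless CONFIG_CPUMASK_OFFSTACK=y, then allocated. */
	if (!zalloc_cpumask_var(&tmp, GFP_KERNEL))
		return -ENOMEM;

	/* tmp = *allowed & *cpu_online_mask, then walk the intersection. */
	cpumask_and(tmp, allowed, cpu_online_mask);
	for_each_cpu(cpu, tmp)
		n++;

	free_cpumask_var(tmp);

	/* The loop is equivalent to cpumask_weight_and(allowed, cpu_online_mask). */
	return n;
}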
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __LINUX_CPUMASK_H
#define __LINUX_CPUMASK_H

/*
 * Cpumasks provide a bitmap suitable for representing the
 * set of CPUs in a system, one bit position per CPU number. In general,
 * only nr_cpu_ids (<= NR_CPUS) bits are valid.
 */
#include <linux/cleanup.h>
#include <linux/kernel.h>
#include <linux/bitmap.h>
#include <linux/cpumask_types.h>
#include <linux/atomic.h>
#include <linux/bug.h>
#include <linux/gfp_types.h>
#include <linux/numa.h>

/**
 * cpumask_pr_args - printf args to output a cpumask
 * @maskp: cpumask to be printed
 *
 * Can be used to provide arguments for '%*pb[l]' when printing a cpumask.
 */
#define cpumask_pr_args(maskp)		nr_cpu_ids, cpumask_bits(maskp)

#if (NR_CPUS == 1) || defined(CONFIG_FORCE_NR_CPUS)
#define nr_cpu_ids ((unsigned int)NR_CPUS)
#else
extern unsigned int nr_cpu_ids;
#endif

static __always_inline void set_nr_cpu_ids(unsigned int nr)
{
#if (NR_CPUS == 1) || defined(CONFIG_FORCE_NR_CPUS)
	WARN_ON(nr != nr_cpu_ids);
#else
	nr_cpu_ids = nr;
#endif
}

/*
 * We have several different "preferred sizes" for the cpumask
 * operations, depending on operation.
* * For example, the bitmap scanning and operating operations have * optimized routines that work for the single-word case, but only when * the size is constant. So if NR_CPUS fits in one single word, we are * better off using that small constant, in order to trigger the * optimized bit finding. That is 'small_cpumask_size'. * * The clearing and copying operations will similarly perform better * with a constant size, but we limit that size arbitrarily to four * words. We call this 'large_cpumask_size'. * * Finally, some operations just want the exact limit, either because * they set bits or just don't have any faster fixed-sized versions. We * call this just 'nr_cpumask_bits'. * * Note that these optional constants are always guaranteed to be at * least as big as 'nr_cpu_ids' itself is, and all our cpumask * allocations are at least that size (see cpumask_size()). The * optimization comes from being able to potentially use a compile-time * constant instead of a run-time generated exact number of CPUs. */ #if NR_CPUS <= BITS_PER_LONG #define small_cpumask_bits ((unsigned int)NR_CPUS) #define large_cpumask_bits ((unsigned int)NR_CPUS) #elif NR_CPUS <= 4*BITS_PER_LONG #define small_cpumask_bits nr_cpu_ids #define large_cpumask_bits ((unsigned int)NR_CPUS) #else #define small_cpumask_bits nr_cpu_ids #define large_cpumask_bits nr_cpu_ids #endif #define nr_cpumask_bits nr_cpu_ids /* * The following particular system cpumasks and operations manage * possible, present, active and online cpus. * * cpu_possible_mask- has bit 'cpu' set iff cpu is populatable * cpu_present_mask - has bit 'cpu' set iff cpu is populated * cpu_enabled_mask - has bit 'cpu' set iff cpu can be brought online * cpu_online_mask - has bit 'cpu' set iff cpu available to scheduler * cpu_active_mask - has bit 'cpu' set iff cpu available to migration * * If !CONFIG_HOTPLUG_CPU, present == possible, and active == online. * * The cpu_possible_mask is fixed at boot time, as the set of CPU IDs * that it is possible might ever be plugged in at anytime during the * life of that system boot. The cpu_present_mask is dynamic(*), * representing which CPUs are currently plugged in. And * cpu_online_mask is the dynamic subset of cpu_present_mask, * indicating those CPUs available for scheduling. * * If HOTPLUG is enabled, then cpu_present_mask varies dynamically, * depending on what ACPI reports as currently plugged in, otherwise * cpu_present_mask is just a copy of cpu_possible_mask. * * (*) Well, cpu_present_mask is dynamic in the hotplug case. If not * hotplug, it's a copy of cpu_possible_mask, hence fixed at boot. * * Subtleties: * 1) UP ARCHes (NR_CPUS == 1, CONFIG_SMP not defined) hardcode * assumption that their single CPU is online. The UP * cpu_{online,possible,present}_masks are placebos. Changing them * will have no useful affect on the following num_*_cpus() * and cpu_*() macros in the UP case. This ugliness is a UP * optimization - don't waste any instructions or memory references * asking if you're online or how many CPUs there are if there is * only one CPU. 
*/ extern struct cpumask __cpu_possible_mask; extern struct cpumask __cpu_online_mask; extern struct cpumask __cpu_enabled_mask; extern struct cpumask __cpu_present_mask; extern struct cpumask __cpu_active_mask; extern struct cpumask __cpu_dying_mask; #define cpu_possible_mask ((const struct cpumask *)&__cpu_possible_mask) #define cpu_online_mask ((const struct cpumask *)&__cpu_online_mask) #define cpu_enabled_mask ((const struct cpumask *)&__cpu_enabled_mask) #define cpu_present_mask ((const struct cpumask *)&__cpu_present_mask) #define cpu_active_mask ((const struct cpumask *)&__cpu_active_mask) #define cpu_dying_mask ((const struct cpumask *)&__cpu_dying_mask) extern atomic_t __num_online_cpus; extern cpumask_t cpus_booted_once_mask; static __always_inline void cpu_max_bits_warn(unsigned int cpu, unsigned int bits) { #ifdef CONFIG_DEBUG_PER_CPU_MAPS WARN_ON_ONCE(cpu >= bits); #endif /* CONFIG_DEBUG_PER_CPU_MAPS */ } /* verify cpu argument to cpumask_* operators */ static __always_inline unsigned int cpumask_check(unsigned int cpu) { cpu_max_bits_warn(cpu, small_cpumask_bits); return cpu; } /** * cpumask_first - get the first cpu in a cpumask * @srcp: the cpumask pointer * * Return: >= nr_cpu_ids if no cpus set. */ static __always_inline unsigned int cpumask_first(const struct cpumask *srcp) { return find_first_bit(cpumask_bits(srcp), small_cpumask_bits); } /** * cpumask_first_zero - get the first unset cpu in a cpumask * @srcp: the cpumask pointer * * Return: >= nr_cpu_ids if all cpus are set. */ static __always_inline unsigned int cpumask_first_zero(const struct cpumask *srcp) { return find_first_zero_bit(cpumask_bits(srcp), small_cpumask_bits); } /** * cpumask_first_and - return the first cpu from *srcp1 & *srcp2 * @srcp1: the first input * @srcp2: the second input * * Return: >= nr_cpu_ids if no cpus set in both. See also cpumask_next_and(). */ static __always_inline unsigned int cpumask_first_and(const struct cpumask *srcp1, const struct cpumask *srcp2) { return find_first_and_bit(cpumask_bits(srcp1), cpumask_bits(srcp2), small_cpumask_bits); } /** * cpumask_first_andnot - return the first cpu from *srcp1 & ~*srcp2 * @srcp1: the first input * @srcp2: the second input * * Return: >= nr_cpu_ids if no such cpu found. */ static __always_inline unsigned int cpumask_first_andnot(const struct cpumask *srcp1, const struct cpumask *srcp2) { return find_first_andnot_bit(cpumask_bits(srcp1), cpumask_bits(srcp2), small_cpumask_bits); } /** * cpumask_first_and_and - return the first cpu from *srcp1 & *srcp2 & *srcp3 * @srcp1: the first input * @srcp2: the second input * @srcp3: the third input * * Return: >= nr_cpu_ids if no cpus set in all. */ static __always_inline unsigned int cpumask_first_and_and(const struct cpumask *srcp1, const struct cpumask *srcp2, const struct cpumask *srcp3) { return find_first_and_and_bit(cpumask_bits(srcp1), cpumask_bits(srcp2), cpumask_bits(srcp3), small_cpumask_bits); } /** * cpumask_last - get the last CPU in a cpumask * @srcp: - the cpumask pointer * * Return: >= nr_cpumask_bits if no CPUs set. */ static __always_inline unsigned int cpumask_last(const struct cpumask *srcp) { return find_last_bit(cpumask_bits(srcp), small_cpumask_bits); } /** * cpumask_next - get the next cpu in a cpumask * @n: the cpu prior to the place to search (i.e. return will be > @n) * @srcp: the cpumask pointer * * Return: >= nr_cpu_ids if no further cpus set. */ static __always_inline unsigned int cpumask_next(int n, const struct cpumask *srcp) { /* -1 is a legal arg here. 
*/ if (n != -1) cpumask_check(n); return find_next_bit(cpumask_bits(srcp), small_cpumask_bits, n + 1); } /** * cpumask_next_zero - get the next unset cpu in a cpumask * @n: the cpu prior to the place to search (i.e. return will be > @n) * @srcp: the cpumask pointer * * Return: >= nr_cpu_ids if no further cpus unset. */ static __always_inline unsigned int cpumask_next_zero(int n, const struct cpumask *srcp) { /* -1 is a legal arg here. */ if (n != -1) cpumask_check(n); return find_next_zero_bit(cpumask_bits(srcp), small_cpumask_bits, n+1); } #if NR_CPUS == 1 /* Uniprocessor: there is only one valid CPU */ static __always_inline unsigned int cpumask_local_spread(unsigned int i, int node) { return 0; } static __always_inline unsigned int cpumask_any_and_distribute(const struct cpumask *src1p, const struct cpumask *src2p) { return cpumask_first_and(src1p, src2p); } static __always_inline unsigned int cpumask_any_distribute(const struct cpumask *srcp) { return cpumask_first(srcp); } #else unsigned int cpumask_local_spread(unsigned int i, int node); unsigned int cpumask_any_and_distribute(const struct cpumask *src1p, const struct cpumask *src2p); unsigned int cpumask_any_distribute(const struct cpumask *srcp); #endif /* NR_CPUS */ /** * cpumask_next_and - get the next cpu in *src1p & *src2p * @n: the cpu prior to the place to search (i.e. return will be > @n) * @src1p: the first cpumask pointer * @src2p: the second cpumask pointer * * Return: >= nr_cpu_ids if no further cpus set in both. */ static __always_inline unsigned int cpumask_next_and(int n, const struct cpumask *src1p, const struct cpumask *src2p) { /* -1 is a legal arg here. */ if (n != -1) cpumask_check(n); return find_next_and_bit(cpumask_bits(src1p), cpumask_bits(src2p), small_cpumask_bits, n + 1); } /** * cpumask_next_andnot - get the next cpu in *src1p & ~*src2p * @n: the cpu prior to the place to search (i.e. return will be > @n) * @src1p: the first cpumask pointer * @src2p: the second cpumask pointer * * Return: >= nr_cpu_ids if no further cpus set in both. */ static __always_inline unsigned int cpumask_next_andnot(int n, const struct cpumask *src1p, const struct cpumask *src2p) { /* -1 is a legal arg here. */ if (n != -1) cpumask_check(n); return find_next_andnot_bit(cpumask_bits(src1p), cpumask_bits(src2p), small_cpumask_bits, n + 1); } /** * cpumask_next_and_wrap - get the next cpu in *src1p & *src2p, starting from * @n+1. If nothing found, wrap around and start from * the beginning * @n: the cpu prior to the place to search (i.e. search starts from @n+1) * @src1p: the first cpumask pointer * @src2p: the second cpumask pointer * * Return: next set bit, wrapped if needed, or >= nr_cpu_ids if @src1p & @src2p is empty. */ static __always_inline unsigned int cpumask_next_and_wrap(int n, const struct cpumask *src1p, const struct cpumask *src2p) { /* -1 is a legal arg here. */ if (n != -1) cpumask_check(n); return find_next_and_bit_wrap(cpumask_bits(src1p), cpumask_bits(src2p), small_cpumask_bits, n + 1); } /** * cpumask_next_wrap - get the next cpu in *src, starting from @n+1. If nothing * found, wrap around and start from the beginning * @n: the cpu prior to the place to search (i.e. search starts from @n+1) * @src: cpumask pointer * * Return: next set bit, wrapped if needed, or >= nr_cpu_ids if @src is empty. */ static __always_inline unsigned int cpumask_next_wrap(int n, const struct cpumask *src) { /* -1 is a legal arg here. 
*/ if (n != -1) cpumask_check(n); return find_next_bit_wrap(cpumask_bits(src), small_cpumask_bits, n + 1); } /** * cpumask_random - get random cpu in *src. * @src: cpumask pointer * * Return: random set bit, or >= nr_cpu_ids if @src is empty. */ static __always_inline unsigned int cpumask_random(const struct cpumask *src) { return find_random_bit(cpumask_bits(src), nr_cpu_ids); } /** * for_each_cpu - iterate over every cpu in a mask * @cpu: the (optionally unsigned) integer iterator * @mask: the cpumask pointer * * After the loop, cpu is >= nr_cpu_ids. */ #define for_each_cpu(cpu, mask) \ for_each_set_bit(cpu, cpumask_bits(mask), small_cpumask_bits) /** * for_each_cpu_wrap - iterate over every cpu in a mask, starting at a specified location * @cpu: the (optionally unsigned) integer iterator * @mask: the cpumask pointer * @start: the start location * * The implementation does not assume any bit in @mask is set (including @start). * * After the loop, cpu is >= nr_cpu_ids. */ #define for_each_cpu_wrap(cpu, mask, start) \ for_each_set_bit_wrap(cpu, cpumask_bits(mask), small_cpumask_bits, start) /** * for_each_cpu_and - iterate over every cpu in both masks * @cpu: the (optionally unsigned) integer iterator * @mask1: the first cpumask pointer * @mask2: the second cpumask pointer * * This saves a temporary CPU mask in many places. It is equivalent to: * struct cpumask tmp; * cpumask_and(&tmp, &mask1, &mask2); * for_each_cpu(cpu, &tmp) * ... * * After the loop, cpu is >= nr_cpu_ids. */ #define for_each_cpu_and(cpu, mask1, mask2) \ for_each_and_bit(cpu, cpumask_bits(mask1), cpumask_bits(mask2), small_cpumask_bits) /** * for_each_cpu_andnot - iterate over every cpu present in one mask, excluding * those present in another. * @cpu: the (optionally unsigned) integer iterator * @mask1: the first cpumask pointer * @mask2: the second cpumask pointer * * This saves a temporary CPU mask in many places. It is equivalent to: * struct cpumask tmp; * cpumask_andnot(&tmp, &mask1, &mask2); * for_each_cpu(cpu, &tmp) * ... * * After the loop, cpu is >= nr_cpu_ids. */ #define for_each_cpu_andnot(cpu, mask1, mask2) \ for_each_andnot_bit(cpu, cpumask_bits(mask1), cpumask_bits(mask2), small_cpumask_bits) /** * for_each_cpu_or - iterate over every cpu present in either mask * @cpu: the (optionally unsigned) integer iterator * @mask1: the first cpumask pointer * @mask2: the second cpumask pointer * * This saves a temporary CPU mask in many places. It is equivalent to: * struct cpumask tmp; * cpumask_or(&tmp, &mask1, &mask2); * for_each_cpu(cpu, &tmp) * ... * * After the loop, cpu is >= nr_cpu_ids. */ #define for_each_cpu_or(cpu, mask1, mask2) \ for_each_or_bit(cpu, cpumask_bits(mask1), cpumask_bits(mask2), small_cpumask_bits) /** * for_each_cpu_from - iterate over CPUs present in @mask, from @cpu to the end of @mask. * @cpu: the (optionally unsigned) integer iterator * @mask: the cpumask pointer * * After the loop, cpu is >= nr_cpu_ids. */ #define for_each_cpu_from(cpu, mask) \ for_each_set_bit_from(cpu, cpumask_bits(mask), small_cpumask_bits) /** * cpumask_any_but - return an arbitrary cpu in a cpumask, but not this one. * @mask: the cpumask to search * @cpu: the cpu to ignore. * * Often used to find any cpu but smp_processor_id() in a mask. * If @cpu == -1, the function is equivalent to cpumask_any(). * Return: >= nr_cpu_ids if no cpus set. */ static __always_inline unsigned int cpumask_any_but(const struct cpumask *mask, int cpu) { unsigned int i; /* -1 is a legal arg here. 
*/ if (cpu != -1) cpumask_check(cpu); for_each_cpu(i, mask) if (i != cpu) break; return i; } /** * cpumask_any_and_but - pick an arbitrary cpu from *mask1 & *mask2, but not this one. * @mask1: the first input cpumask * @mask2: the second input cpumask * @cpu: the cpu to ignore * * If @cpu == -1, the function is equivalent to cpumask_any_and(). * Returns >= nr_cpu_ids if no cpus set. */ static __always_inline unsigned int cpumask_any_and_but(const struct cpumask *mask1, const struct cpumask *mask2, int cpu) { unsigned int i; /* -1 is a legal arg here. */ if (cpu != -1) cpumask_check(cpu); i = cpumask_first_and(mask1, mask2); if (i != cpu) return i; return cpumask_next_and(cpu, mask1, mask2); } /** * cpumask_any_andnot_but - pick an arbitrary cpu from *mask1 & ~*mask2, but not this one. * @mask1: the first input cpumask * @mask2: the second input cpumask * @cpu: the cpu to ignore * * If @cpu == -1, the function returns the first matching cpu. * Returns >= nr_cpu_ids if no cpus set. */ static __always_inline unsigned int cpumask_any_andnot_but(const struct cpumask *mask1, const struct cpumask *mask2, int cpu) { unsigned int i; /* -1 is a legal arg here. */ if (cpu != -1) cpumask_check(cpu); i = cpumask_first_andnot(mask1, mask2); if (i != cpu) return i; return cpumask_next_andnot(cpu, mask1, mask2); } /** * cpumask_nth - get the Nth cpu in a cpumask * @srcp: the cpumask pointer * @cpu: the Nth cpu to find, starting from 0 * * Return: >= nr_cpu_ids if such cpu doesn't exist. */ static __always_inline unsigned int cpumask_nth(unsigned int cpu, const struct cpumask *srcp) { return find_nth_bit(cpumask_bits(srcp), small_cpumask_bits, cpumask_check(cpu)); } /** * cpumask_nth_and - get the Nth cpu in 2 cpumasks * @srcp1: the cpumask pointer * @srcp2: the cpumask pointer * @cpu: the Nth cpu to find, starting from 0 * * Return: >= nr_cpu_ids if such cpu doesn't exist. */ static __always_inline unsigned int cpumask_nth_and(unsigned int cpu, const struct cpumask *srcp1, const struct cpumask *srcp2) { return find_nth_and_bit(cpumask_bits(srcp1), cpumask_bits(srcp2), small_cpumask_bits, cpumask_check(cpu)); } /** * cpumask_nth_and_andnot - get the Nth cpu set in 1st and 2nd cpumask, and clear in 3rd. * @srcp1: the cpumask pointer * @srcp2: the cpumask pointer * @srcp3: the cpumask pointer * @cpu: the Nth cpu to find, starting from 0 * * Return: >= nr_cpu_ids if such cpu doesn't exist. */ static __always_inline unsigned int cpumask_nth_and_andnot(unsigned int cpu, const struct cpumask *srcp1, const struct cpumask *srcp2, const struct cpumask *srcp3) { return find_nth_and_andnot_bit(cpumask_bits(srcp1), cpumask_bits(srcp2), cpumask_bits(srcp3), small_cpumask_bits, cpumask_check(cpu)); } #define CPU_BITS_NONE \ { \ [0 ... 
BITS_TO_LONGS(NR_CPUS)-1] = 0UL \ } #define CPU_BITS_CPU0 \ { \ [0] = 1UL \ } /** * cpumask_set_cpu - set a cpu in a cpumask * @cpu: cpu number (< nr_cpu_ids) * @dstp: the cpumask pointer */ static __always_inline void cpumask_set_cpu(unsigned int cpu, struct cpumask *dstp) { set_bit(cpumask_check(cpu), cpumask_bits(dstp)); } static __always_inline void __cpumask_set_cpu(unsigned int cpu, struct cpumask *dstp) { __set_bit(cpumask_check(cpu), cpumask_bits(dstp)); } /** * cpumask_clear_cpus - clear cpus in a cpumask * @dstp: the cpumask pointer * @cpu: cpu number (< nr_cpu_ids) * @ncpus: number of cpus to clear (< nr_cpu_ids) */ static __always_inline void cpumask_clear_cpus(struct cpumask *dstp, unsigned int cpu, unsigned int ncpus) { cpumask_check(cpu + ncpus - 1); bitmap_clear(cpumask_bits(dstp), cpumask_check(cpu), ncpus); } /** * cpumask_clear_cpu - clear a cpu in a cpumask * @cpu: cpu number (< nr_cpu_ids) * @dstp: the cpumask pointer */ static __always_inline void cpumask_clear_cpu(int cpu, struct cpumask *dstp) { clear_bit(cpumask_check(cpu), cpumask_bits(dstp)); } static __always_inline void __cpumask_clear_cpu(int cpu, struct cpumask *dstp) { __clear_bit(cpumask_check(cpu), cpumask_bits(dstp)); } /** * cpumask_test_cpu - test for a cpu in a cpumask * @cpu: cpu number (< nr_cpu_ids) * @cpumask: the cpumask pointer * * Return: true if @cpu is set in @cpumask, else returns false */ static __always_inline bool cpumask_test_cpu(int cpu, const struct cpumask *cpumask) { return test_bit(cpumask_check(cpu), cpumask_bits((cpumask))); } /** * cpumask_test_and_set_cpu - atomically test and set a cpu in a cpumask * @cpu: cpu number (< nr_cpu_ids) * @cpumask: the cpumask pointer * * test_and_set_bit wrapper for cpumasks. * * Return: true if @cpu is set in old bitmap of @cpumask, else returns false */ static __always_inline bool cpumask_test_and_set_cpu(int cpu, struct cpumask *cpumask) { return test_and_set_bit(cpumask_check(cpu), cpumask_bits(cpumask)); } /** * cpumask_test_and_clear_cpu - atomically test and clear a cpu in a cpumask * @cpu: cpu number (< nr_cpu_ids) * @cpumask: the cpumask pointer * * test_and_clear_bit wrapper for cpumasks. 
* * Return: true if @cpu is set in old bitmap of @cpumask, else returns false */ static __always_inline bool cpumask_test_and_clear_cpu(int cpu, struct cpumask *cpumask) { return test_and_clear_bit(cpumask_check(cpu), cpumask_bits(cpumask)); } /** * cpumask_setall - set all cpus (< nr_cpu_ids) in a cpumask * @dstp: the cpumask pointer */ static __always_inline void cpumask_setall(struct cpumask *dstp) { if (small_const_nbits(small_cpumask_bits)) { cpumask_bits(dstp)[0] = BITMAP_LAST_WORD_MASK(nr_cpumask_bits); return; } bitmap_fill(cpumask_bits(dstp), nr_cpumask_bits); } /** * cpumask_clear - clear all cpus (< nr_cpu_ids) in a cpumask * @dstp: the cpumask pointer */ static __always_inline void cpumask_clear(struct cpumask *dstp) { bitmap_zero(cpumask_bits(dstp), large_cpumask_bits); } /** * cpumask_and - *dstp = *src1p & *src2p * @dstp: the cpumask result * @src1p: the first input * @src2p: the second input * * Return: false if *@dstp is empty, else returns true */ static __always_inline bool cpumask_and(struct cpumask *dstp, const struct cpumask *src1p, const struct cpumask *src2p) { return bitmap_and(cpumask_bits(dstp), cpumask_bits(src1p), cpumask_bits(src2p), small_cpumask_bits); } /** * cpumask_or - *dstp = *src1p | *src2p * @dstp: the cpumask result * @src1p: the first input * @src2p: the second input */ static __always_inline void cpumask_or(struct cpumask *dstp, const struct cpumask *src1p, const struct cpumask *src2p) { bitmap_or(cpumask_bits(dstp), cpumask_bits(src1p), cpumask_bits(src2p), small_cpumask_bits); } /** * cpumask_xor - *dstp = *src1p ^ *src2p * @dstp: the cpumask result * @src1p: the first input * @src2p: the second input */ static __always_inline void cpumask_xor(struct cpumask *dstp, const struct cpumask *src1p, const struct cpumask *src2p) { bitmap_xor(cpumask_bits(dstp), cpumask_bits(src1p), cpumask_bits(src2p), small_cpumask_bits); } /** * cpumask_andnot - *dstp = *src1p & ~*src2p * @dstp: the cpumask result * @src1p: the first input * @src2p: the second input * * Return: false if *@dstp is empty, else returns true */ static __always_inline bool cpumask_andnot(struct cpumask *dstp, const struct cpumask *src1p, const struct cpumask *src2p) { return bitmap_andnot(cpumask_bits(dstp), cpumask_bits(src1p), cpumask_bits(src2p), small_cpumask_bits); } /** * cpumask_equal - *src1p == *src2p * @src1p: the first input * @src2p: the second input * * Return: true if the cpumasks are equal, false if not */ static __always_inline bool cpumask_equal(const struct cpumask *src1p, const struct cpumask *src2p) { return bitmap_equal(cpumask_bits(src1p), cpumask_bits(src2p), small_cpumask_bits); } /** * cpumask_or_equal - *src1p | *src2p == *src3p * @src1p: the first input * @src2p: the second input * @src3p: the third input * * Return: true if first cpumask ORed with second cpumask == third cpumask, * otherwise false */ static __always_inline bool cpumask_or_equal(const struct cpumask *src1p, const struct cpumask *src2p, const struct cpumask *src3p) { return bitmap_or_equal(cpumask_bits(src1p), cpumask_bits(src2p), cpumask_bits(src3p), small_cpumask_bits); } /** * cpumask_intersects - (*src1p & *src2p) != 0 * @src1p: the first input * @src2p: the second input * * Return: true if first cpumask ANDed with second cpumask is non-empty, * otherwise false */ static __always_inline bool cpumask_intersects(const struct cpumask *src1p, const struct cpumask *src2p) { return bitmap_intersects(cpumask_bits(src1p), cpumask_bits(src2p), small_cpumask_bits); } /** * cpumask_subset - (*src1p & 
~*src2p) == 0 * @src1p: the first input * @src2p: the second input * * Return: true if *@src1p is a subset of *@src2p, else returns false */ static __always_inline bool cpumask_subset(const struct cpumask *src1p, const struct cpumask *src2p) { return bitmap_subset(cpumask_bits(src1p), cpumask_bits(src2p), small_cpumask_bits); } /** * cpumask_empty - *srcp == 0 * @srcp: the cpumask to that all cpus < nr_cpu_ids are clear. * * Return: true if srcp is empty (has no bits set), else false */ static __always_inline bool cpumask_empty(const struct cpumask *srcp) { return bitmap_empty(cpumask_bits(srcp), small_cpumask_bits); } /** * cpumask_full - *srcp == 0xFFFFFFFF... * @srcp: the cpumask to that all cpus < nr_cpu_ids are set. * * Return: true if srcp is full (has all bits set), else false */ static __always_inline bool cpumask_full(const struct cpumask *srcp) { return bitmap_full(cpumask_bits(srcp), nr_cpumask_bits); } /** * cpumask_weight - Count of bits in *srcp * @srcp: the cpumask to count bits (< nr_cpu_ids) in. * * Return: count of bits set in *srcp */ static __always_inline unsigned int cpumask_weight(const struct cpumask *srcp) { return bitmap_weight(cpumask_bits(srcp), small_cpumask_bits); } /** * cpumask_weight_and - Count of bits in (*srcp1 & *srcp2) * @srcp1: the cpumask to count bits (< nr_cpu_ids) in. * @srcp2: the cpumask to count bits (< nr_cpu_ids) in. * * Return: count of bits set in both *srcp1 and *srcp2 */ static __always_inline unsigned int cpumask_weight_and(const struct cpumask *srcp1, const struct cpumask *srcp2) { return bitmap_weight_and(cpumask_bits(srcp1), cpumask_bits(srcp2), small_cpumask_bits); } /** * cpumask_weight_andnot - Count of bits in (*srcp1 & ~*srcp2) * @srcp1: the cpumask to count bits (< nr_cpu_ids) in. * @srcp2: the cpumask to count bits (< nr_cpu_ids) in. * * Return: count of bits set in both *srcp1 and *srcp2 */ static __always_inline unsigned int cpumask_weight_andnot(const struct cpumask *srcp1, const struct cpumask *srcp2) { return bitmap_weight_andnot(cpumask_bits(srcp1), cpumask_bits(srcp2), small_cpumask_bits); } /** * cpumask_shift_right - *dstp = *srcp >> n * @dstp: the cpumask result * @srcp: the input to shift * @n: the number of bits to shift by */ static __always_inline void cpumask_shift_right(struct cpumask *dstp, const struct cpumask *srcp, int n) { bitmap_shift_right(cpumask_bits(dstp), cpumask_bits(srcp), n, small_cpumask_bits); } /** * cpumask_shift_left - *dstp = *srcp << n * @dstp: the cpumask result * @srcp: the input to shift * @n: the number of bits to shift by */ static __always_inline void cpumask_shift_left(struct cpumask *dstp, const struct cpumask *srcp, int n) { bitmap_shift_left(cpumask_bits(dstp), cpumask_bits(srcp), n, nr_cpumask_bits); } /** * cpumask_copy - *dstp = *srcp * @dstp: the result * @srcp: the input cpumask */ static __always_inline void cpumask_copy(struct cpumask *dstp, const struct cpumask *srcp) { bitmap_copy(cpumask_bits(dstp), cpumask_bits(srcp), large_cpumask_bits); } /** * cpumask_any - pick an arbitrary cpu from *srcp * @srcp: the input cpumask * * Return: >= nr_cpu_ids if no cpus set. */ #define cpumask_any(srcp) cpumask_first(srcp) /** * cpumask_any_and - pick an arbitrary cpu from *mask1 & *mask2 * @mask1: the first input cpumask * @mask2: the second input cpumask * * Return: >= nr_cpu_ids if no cpus set. 
*/ #define cpumask_any_and(mask1, mask2) cpumask_first_and((mask1), (mask2)) /** * cpumask_of - the cpumask containing just a given cpu * @cpu: the cpu (<= nr_cpu_ids) */ #define cpumask_of(cpu) (get_cpu_mask(cpu)) /** * cpumask_parse_user - extract a cpumask from a user string * @buf: the buffer to extract from * @len: the length of the buffer * @dstp: the cpumask to set. * * Return: -errno, or 0 for success. */ static __always_inline int cpumask_parse_user(const char __user *buf, int len, struct cpumask *dstp) { return bitmap_parse_user(buf, len, cpumask_bits(dstp), nr_cpumask_bits); } /** * cpumask_parselist_user - extract a cpumask from a user string * @buf: the buffer to extract from * @len: the length of the buffer * @dstp: the cpumask to set. * * Return: -errno, or 0 for success. */ static __always_inline int cpumask_parselist_user(const char __user *buf, int len, struct cpumask *dstp) { return bitmap_parselist_user(buf, len, cpumask_bits(dstp), nr_cpumask_bits); } /** * cpumask_parse - extract a cpumask from a string * @buf: the buffer to extract from * @dstp: the cpumask to set. * * Return: -errno, or 0 for success. */ static __always_inline int cpumask_parse(const char *buf, struct cpumask *dstp) { return bitmap_parse(buf, UINT_MAX, cpumask_bits(dstp), nr_cpumask_bits); } /** * cpulist_parse - extract a cpumask from a user string of ranges * @buf: the buffer to extract from * @dstp: the cpumask to set. * * Return: -errno, or 0 for success. */ static __always_inline int cpulist_parse(const char *buf, struct cpumask *dstp) { return bitmap_parselist(buf, cpumask_bits(dstp), nr_cpumask_bits); } /** * cpumask_size - calculate size to allocate for a 'struct cpumask' in bytes * * Return: size to allocate for a &struct cpumask in bytes */ static __always_inline unsigned int cpumask_size(void) { return bitmap_size(large_cpumask_bits); } #ifdef CONFIG_CPUMASK_OFFSTACK #define this_cpu_cpumask_var_ptr(x) this_cpu_read(x) #define __cpumask_var_read_mostly __read_mostly bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node); static __always_inline bool zalloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node) { return alloc_cpumask_var_node(mask, flags | __GFP_ZERO, node); } /** * alloc_cpumask_var - allocate a struct cpumask * @mask: pointer to cpumask_var_t where the cpumask is returned * @flags: GFP_ flags * * Only defined when CONFIG_CPUMASK_OFFSTACK=y, otherwise is * a nop returning a constant 1 (in <linux/cpumask.h>). * * See alloc_cpumask_var_node. 
* * Return: %true if allocation succeeded, %false if not */ static __always_inline bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags) { return alloc_cpumask_var_node(mask, flags, NUMA_NO_NODE); } static __always_inline bool zalloc_cpumask_var(cpumask_var_t *mask, gfp_t flags) { return alloc_cpumask_var(mask, flags | __GFP_ZERO); } void alloc_bootmem_cpumask_var(cpumask_var_t *mask); void free_cpumask_var(cpumask_var_t mask); void free_bootmem_cpumask_var(cpumask_var_t mask); static __always_inline bool cpumask_available(cpumask_var_t mask) { return mask != NULL; } #else #define this_cpu_cpumask_var_ptr(x) this_cpu_ptr(x) #define __cpumask_var_read_mostly static __always_inline bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags) { return true; } static __always_inline bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node) { return true; } static __always_inline bool zalloc_cpumask_var(cpumask_var_t *mask, gfp_t flags) { cpumask_clear(*mask); return true; } static __always_inline bool zalloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node) { cpumask_clear(*mask); return true; } static __always_inline void alloc_bootmem_cpumask_var(cpumask_var_t *mask) { } static __always_inline void free_cpumask_var(cpumask_var_t mask) { } static __always_inline void free_bootmem_cpumask_var(cpumask_var_t mask) { } static __always_inline bool cpumask_available(cpumask_var_t mask) { return true; } #endif /* CONFIG_CPUMASK_OFFSTACK */ DEFINE_FREE(free_cpumask_var, struct cpumask *, if (_T) free_cpumask_var(_T)); /* It's common to want to use cpu_all_mask in struct member initializers, * so it has to refer to an address rather than a pointer. */ extern const DECLARE_BITMAP(cpu_all_bits, NR_CPUS); #define cpu_all_mask to_cpumask(cpu_all_bits) /* First bits of cpu_bit_bitmap are in fact unset. 
*/ #define cpu_none_mask to_cpumask(cpu_bit_bitmap[0]) #if NR_CPUS == 1 /* Uniprocessor: the possible/online/present masks are always "1" */ #define for_each_possible_cpu(cpu) for ((cpu) = 0; (cpu) < 1; (cpu)++) #define for_each_online_cpu(cpu) for ((cpu) = 0; (cpu) < 1; (cpu)++) #define for_each_present_cpu(cpu) for ((cpu) = 0; (cpu) < 1; (cpu)++) #define for_each_possible_cpu_wrap(cpu, start) \ for ((void)(start), (cpu) = 0; (cpu) < 1; (cpu)++) #define for_each_online_cpu_wrap(cpu, start) \ for ((void)(start), (cpu) = 0; (cpu) < 1; (cpu)++) #else #define for_each_possible_cpu(cpu) for_each_cpu((cpu), cpu_possible_mask) #define for_each_online_cpu(cpu) for_each_cpu((cpu), cpu_online_mask) #define for_each_enabled_cpu(cpu) for_each_cpu((cpu), cpu_enabled_mask) #define for_each_present_cpu(cpu) for_each_cpu((cpu), cpu_present_mask) #define for_each_possible_cpu_wrap(cpu, start) \ for_each_cpu_wrap((cpu), cpu_possible_mask, (start)) #define for_each_online_cpu_wrap(cpu, start) \ for_each_cpu_wrap((cpu), cpu_online_mask, (start)) #endif /* Wrappers for arch boot code to manipulate normally-constant masks */ void init_cpu_present(const struct cpumask *src); void init_cpu_possible(const struct cpumask *src); #define assign_cpu(cpu, mask, val) \ assign_bit(cpumask_check(cpu), cpumask_bits(mask), (val)) #define __assign_cpu(cpu, mask, val) \ __assign_bit(cpumask_check(cpu), cpumask_bits(mask), (val)) #define set_cpu_possible(cpu, possible) assign_cpu((cpu), &__cpu_possible_mask, (possible)) #define set_cpu_enabled(cpu, enabled) assign_cpu((cpu), &__cpu_enabled_mask, (enabled)) #define set_cpu_present(cpu, present) assign_cpu((cpu), &__cpu_present_mask, (present)) #define set_cpu_active(cpu, active) assign_cpu((cpu), &__cpu_active_mask, (active)) #define set_cpu_dying(cpu, dying) assign_cpu((cpu), &__cpu_dying_mask, (dying)) void set_cpu_online(unsigned int cpu, bool online); /** * to_cpumask - convert a NR_CPUS bitmap to a struct cpumask * * @bitmap: the bitmap * * There are a few places where cpumask_var_t isn't appropriate and * static cpumasks must be used (eg. very early boot), yet we don't * expose the definition of 'struct cpumask'. * * This does the conversion, and can be used as a constant initializer. */ #define to_cpumask(bitmap) \ ((struct cpumask *)(1 ? (bitmap) \ : (void *)sizeof(__check_is_bitmap(bitmap)))) static __always_inline int __check_is_bitmap(const unsigned long *bitmap) { return 1; } /* * Special-case data structure for "single bit set only" constant CPU masks. * * We pre-generate all the 64 (or 32) possible bit positions, with enough * padding to the left and the right, and return the constant pointer * appropriately offset. */ extern const unsigned long cpu_bit_bitmap[BITS_PER_LONG+1][BITS_TO_LONGS(NR_CPUS)]; static __always_inline const struct cpumask *get_cpu_mask(unsigned int cpu) { const unsigned long *p = cpu_bit_bitmap[1 + cpu % BITS_PER_LONG]; p -= cpu / BITS_PER_LONG; return to_cpumask(p); } #if NR_CPUS > 1 /** * num_online_cpus() - Read the number of online CPUs * * Despite the fact that __num_online_cpus is of type atomic_t, this * interface gives only a momentary snapshot and is not protected against * concurrent CPU hotplug operations unless invoked from a cpuhp_lock held * region. 
* * Return: momentary snapshot of the number of online CPUs */ static __always_inline unsigned int num_online_cpus(void) { return raw_atomic_read(&__num_online_cpus); } #define num_possible_cpus() cpumask_weight(cpu_possible_mask) #define num_enabled_cpus() cpumask_weight(cpu_enabled_mask) #define num_present_cpus() cpumask_weight(cpu_present_mask) #define num_active_cpus() cpumask_weight(cpu_active_mask) static __always_inline bool cpu_online(unsigned int cpu) { return cpumask_test_cpu(cpu, cpu_online_mask); } static __always_inline bool cpu_enabled(unsigned int cpu) { return cpumask_test_cpu(cpu, cpu_enabled_mask); } static __always_inline bool cpu_possible(unsigned int cpu) { return cpumask_test_cpu(cpu, cpu_possible_mask); } static __always_inline bool cpu_present(unsigned int cpu) { return cpumask_test_cpu(cpu, cpu_present_mask); } static __always_inline bool cpu_active(unsigned int cpu) { return cpumask_test_cpu(cpu, cpu_active_mask); } static __always_inline bool cpu_dying(unsigned int cpu) { return cpumask_test_cpu(cpu, cpu_dying_mask); } #else #define num_online_cpus() 1U #define num_possible_cpus() 1U #define num_enabled_cpus() 1U #define num_present_cpus() 1U #define num_active_cpus() 1U static __always_inline bool cpu_online(unsigned int cpu) { return cpu == 0; } static __always_inline bool cpu_possible(unsigned int cpu) { return cpu == 0; } static __always_inline bool cpu_enabled(unsigned int cpu) { return cpu == 0; } static __always_inline bool cpu_present(unsigned int cpu) { return cpu == 0; } static __always_inline bool cpu_active(unsigned int cpu) { return cpu == 0; } static __always_inline bool cpu_dying(unsigned int cpu) { return false; } #endif /* NR_CPUS > 1 */ #define cpu_is_offline(cpu) unlikely(!cpu_online(cpu)) #if NR_CPUS <= BITS_PER_LONG #define CPU_BITS_ALL \ { \ [BITS_TO_LONGS(NR_CPUS)-1] = BITMAP_LAST_WORD_MASK(NR_CPUS) \ } #else /* NR_CPUS > BITS_PER_LONG */ #define CPU_BITS_ALL \ { \ [0 ... BITS_TO_LONGS(NR_CPUS)-2] = ~0UL, \ [BITS_TO_LONGS(NR_CPUS)-1] = BITMAP_LAST_WORD_MASK(NR_CPUS) \ } #endif /* NR_CPUS > BITS_PER_LONG */ /** * cpumap_print_to_pagebuf - copies the cpumask into the buffer either * as comma-separated list of cpus or hex values of cpumask * @list: indicates whether the cpumap must be list * @mask: the cpumask to copy * @buf: the buffer to copy into * * Return: the length of the (null-terminated) @buf string, zero if * nothing is copied. */ static __always_inline ssize_t cpumap_print_to_pagebuf(bool list, char *buf, const struct cpumask *mask) { return bitmap_print_to_pagebuf(list, buf, cpumask_bits(mask), nr_cpu_ids); } /** * cpumap_print_bitmask_to_buf - copies the cpumask into the buffer as * hex values of cpumask * * @buf: the buffer to copy into * @mask: the cpumask to copy * @off: in the string from which we are copying, we copy to @buf * @count: the maximum number of bytes to print * * The function prints the cpumask into the buffer as hex values of * cpumask; Typically used by bin_attribute to export cpumask bitmask * ABI. * * Return: the length of how many bytes have been copied, excluding * terminating '\0'. 
*/ static __always_inline ssize_t cpumap_print_bitmask_to_buf(char *buf, const struct cpumask *mask, loff_t off, size_t count) { return bitmap_print_bitmask_to_buf(buf, cpumask_bits(mask), nr_cpu_ids, off, count) - 1; } /** * cpumap_print_list_to_buf - copies the cpumask into the buffer as * comma-separated list of cpus * @buf: the buffer to copy into * @mask: the cpumask to copy * @off: in the string from which we are copying, we copy to @buf * @count: the maximum number of bytes to print * * Everything is same with the above cpumap_print_bitmask_to_buf() * except the print format. * * Return: the length of how many bytes have been copied, excluding * terminating '\0'. */ static __always_inline ssize_t cpumap_print_list_to_buf(char *buf, const struct cpumask *mask, loff_t off, size_t count) { return bitmap_print_list_to_buf(buf, cpumask_bits(mask), nr_cpu_ids, off, count) - 1; } #if NR_CPUS <= BITS_PER_LONG #define CPU_MASK_ALL \ (cpumask_t) { { \ [BITS_TO_LONGS(NR_CPUS)-1] = BITMAP_LAST_WORD_MASK(NR_CPUS) \ } } #else #define CPU_MASK_ALL \ (cpumask_t) { { \ [0 ... BITS_TO_LONGS(NR_CPUS)-2] = ~0UL, \ [BITS_TO_LONGS(NR_CPUS)-1] = BITMAP_LAST_WORD_MASK(NR_CPUS) \ } } #endif /* NR_CPUS > BITS_PER_LONG */ #define CPU_MASK_NONE \ (cpumask_t) { { \ [0 ... BITS_TO_LONGS(NR_CPUS)-1] = 0UL \ } } #define CPU_MASK_CPU0 \ (cpumask_t) { { \ [0] = 1UL \ } } /* * Provide a valid theoretical max size for cpumap and cpulist sysfs files * to avoid breaking userspace which may allocate a buffer based on the size * reported by e.g. fstat. * * for cpumap NR_CPUS * 9/32 - 1 should be an exact length. * * For cpulist 7 is (ceil(log10(NR_CPUS)) + 1) allowing for NR_CPUS to be up * to 2 orders of magnitude larger than 8192. And then we divide by 2 to * cover a worst-case of every other cpu being on one of two nodes for a * very large NR_CPUS. * * Use PAGE_SIZE as a minimum for smaller configurations while avoiding * unsigned comparison to -1. */ #define CPUMAP_FILE_MAX_BYTES (((NR_CPUS * 9)/32 > PAGE_SIZE) \ ? (NR_CPUS * 9)/32 - 1 : PAGE_SIZE) #define CPULIST_FILE_MAX_BYTES (((NR_CPUS * 7)/2 > PAGE_SIZE) ? (NR_CPUS * 7)/2 : PAGE_SIZE) #endif /* __LINUX_CPUMASK_H */
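For context on how the parsing and allocation helpers above are typically combined, here is a minimal illustrative sketch (not part of cpumask.h; the function name and the "list" string are hypothetical): a CPU list such as "0-3,8" is parsed into a dynamically allocated mask and then iterated.

#include <linux/cpumask.h>
#include <linux/gfp.h>
#include <linux/printk.h>

/* Hypothetical helper: parse a cpulist string and log the selected CPUs. */
static int apply_cpu_list(const char *list)
{
	cpumask_var_t mask;
	unsigned int cpu;
	int err;

	/* On CONFIG_CPUMASK_OFFSTACK=n this "allocation" is a nop returning true. */
	if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
		return -ENOMEM;

	err = cpulist_parse(list, mask);	/* e.g. "0-3,8" sets bits 0,1,2,3,8 */
	if (!err)
		for_each_cpu(cpu, mask)
			pr_info("selected cpu %u\n", cpu);

	free_cpumask_var(mask);
	return err;
}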
// SPDX-License-Identifier: GPL-2.0-only /* * This file contains vfs directory ops for the 9P2000 protocol. * * Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com> * Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov> */ #include <linux/module.h> #include <linux/errno.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/stat.h> #include <linux/string.h> #include <linux/sched.h> #include <linux/slab.h> #include <linux/uio.h> #include <linux/fscache.h> #include <net/9p/9p.h> #include <net/9p/client.h> #include "v9fs.h" #include "v9fs_vfs.h" #include "fid.h" /** * struct p9_rdir - readdir accounting * @head: start offset of current dirread buffer * @tail: end offset of current dirread buffer * @buf: dirread buffer * * private structure for keeping track of readdir * allocated on demand */ struct p9_rdir { int head; int tail; uint8_t buf[]; }; /** * dt_type - return file type * @mistat: mistat structure * */ static inline int dt_type(struct p9_wstat *mistat) { unsigned long perm = mistat->mode; int rettype = DT_REG; if (perm & P9_DMDIR) rettype = DT_DIR; if (perm & P9_DMSYMLINK) rettype = DT_LNK; return rettype; } /** * v9fs_alloc_rdir_buf - Allocate buffer used for read and readdir * @filp: opened file structure * @buflen: Length in bytes of buffer to allocate * */ static struct p9_rdir *v9fs_alloc_rdir_buf(struct file *filp, int buflen) { struct p9_fid *fid = filp->private_data; if (!fid->rdir) fid->rdir = kzalloc(sizeof(struct p9_rdir) + buflen, GFP_KERNEL); return fid->rdir; } /** * v9fs_dir_readdir - iterate through a directory * @file: opened file structure * @ctx: actor we feed the entries to * */ static int v9fs_dir_readdir(struct file *file, struct dir_context *ctx) { bool over; struct p9_wstat st; int err = 0; struct p9_fid *fid; int buflen; struct p9_rdir *rdir; struct kvec kvec; p9_debug(P9_DEBUG_VFS, "name %pD\n", file); fid = file->private_data; buflen = fid->clnt->msize - P9_IOHDRSZ; rdir = v9fs_alloc_rdir_buf(file, buflen); if (!rdir) return -ENOMEM; kvec.iov_base = rdir->buf; kvec.iov_len = buflen; while (1) { if (rdir->tail == rdir->head) { struct iov_iter to; int n; iov_iter_kvec(&to, ITER_DEST, &kvec, 1, buflen); n = p9_client_read(file->private_data, ctx->pos, &to, &err); if (err) return err; if (n == 0) return 0; rdir->head = 0; rdir->tail = n; } while (rdir->head < rdir->tail) { err = p9stat_read(fid->clnt, rdir->buf + rdir->head, rdir->tail - rdir->head, &st); if (err <= 0) { p9_debug(P9_DEBUG_VFS, "returned %d\n", err); return -EIO; } over = !dir_emit(ctx, st.name, strlen(st.name), QID2INO(&st.qid), dt_type(&st)); p9stat_free(&st); if (over) 
return 0; rdir->head += err; ctx->pos += err; } } } /** * v9fs_dir_readdir_dotl - iterate through a directory * @file: opened file structure * @ctx: actor we feed the entries to * */ static int v9fs_dir_readdir_dotl(struct file *file, struct dir_context *ctx) { int err = 0; struct p9_fid *fid; int buflen; struct p9_rdir *rdir; struct p9_dirent curdirent; p9_debug(P9_DEBUG_VFS, "name %pD\n", file); fid = file->private_data; buflen = fid->clnt->msize - P9_READDIRHDRSZ; rdir = v9fs_alloc_rdir_buf(file, buflen); if (!rdir) return -ENOMEM; while (1) { if (rdir->tail == rdir->head) { err = p9_client_readdir(fid, rdir->buf, buflen, ctx->pos); if (err <= 0) return err; rdir->head = 0; rdir->tail = err; } while (rdir->head < rdir->tail) { err = p9dirent_read(fid->clnt, rdir->buf + rdir->head, rdir->tail - rdir->head, &curdirent); if (err < 0) { p9_debug(P9_DEBUG_VFS, "returned %d\n", err); return -EIO; } if (!dir_emit(ctx, curdirent.d_name, strlen(curdirent.d_name), QID2INO(&curdirent.qid), curdirent.d_type)) return 0; ctx->pos = curdirent.d_off; rdir->head += err; } } } /** * v9fs_dir_release - close a directory or a file * @inode: inode of the directory or file * @filp: file pointer to a directory or file * */ int v9fs_dir_release(struct inode *inode, struct file *filp) { struct v9fs_inode *v9inode = V9FS_I(inode); struct p9_fid *fid; __le32 version; loff_t i_size; int retval = 0, put_err; fid = filp->private_data; p9_debug(P9_DEBUG_VFS, "inode: %p filp: %p fid: %d\n", inode, filp, fid ? fid->fid : -1); if (fid) { if ((S_ISREG(inode->i_mode)) && (filp->f_mode & FMODE_WRITE)) retval = filemap_fdatawrite(inode->i_mapping); spin_lock(&inode->i_lock); hlist_del(&fid->ilist); spin_unlock(&inode->i_lock); put_err = p9_fid_put(fid); retval = retval < 0 ? retval : put_err; } if ((filp->f_mode & FMODE_WRITE)) { version = cpu_to_le32(v9inode->qid.version); i_size = i_size_read(inode); fscache_unuse_cookie(v9fs_inode_cookie(v9inode), &version, &i_size); } else { fscache_unuse_cookie(v9fs_inode_cookie(v9inode), NULL, NULL); } return retval; } const struct file_operations v9fs_dir_operations = { .read = generic_read_dir, .llseek = generic_file_llseek, .iterate_shared = v9fs_dir_readdir, .open = v9fs_file_open, .release = v9fs_dir_release, }; const struct file_operations v9fs_dir_operations_dotl = { .read = generic_read_dir, .llseek = generic_file_llseek, .iterate_shared = v9fs_dir_readdir_dotl, .open = v9fs_file_open, .release = v9fs_dir_release, .fsync = v9fs_file_fsync_dotl, };
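As a reference for the dir_context/dir_emit contract that v9fs_dir_readdir() and v9fs_dir_readdir_dotl() rely on, the following is an illustrative sketch of a trivial ->iterate_shared implementation (hypothetical names, not 9p code): emit the dot entries, then one synthetic entry, advancing ctx->pos so the VFS does not request the same entry again.

#include <linux/fs.h>

/* Hypothetical: a directory containing exactly one regular file, "hello". */
static int demo_dir_iterate(struct file *file, struct dir_context *ctx)
{
	if (!dir_emit_dots(file, ctx))	/* positions 0 and 1: "." and ".." */
		return 0;

	if (ctx->pos == 2) {
		/* stop if the caller's buffer is full; the next call resumes here */
		if (!dir_emit(ctx, "hello", 5, 1234 /* example ino */, DT_REG))
			return 0;
		ctx->pos++;
	}
	return 0;
}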
/* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Definitions for the IP protocol. * * Version: @(#)ip.h 1.0.2 04/28/93 * * Authors: Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> */ #ifndef _LINUX_IP_H #define _LINUX_IP_H #include <linux/skbuff.h> #include <uapi/linux/ip.h> static inline struct iphdr *ip_hdr(const struct sk_buff *skb) { return (struct iphdr *)skb_network_header(skb); } static inline struct iphdr *inner_ip_hdr(const struct sk_buff *skb) { return (struct iphdr *)skb_inner_network_header(skb); } static inline struct iphdr *ipip_hdr(const struct sk_buff *skb) { return (struct iphdr *)skb_transport_header(skb); } static inline unsigned int ip_transport_len(const struct sk_buff *skb) { return ntohs(ip_hdr(skb)->tot_len) - skb_network_header_len(skb); } static inline unsigned int iph_totlen(const struct sk_buff *skb, const struct iphdr *iph) { u32 len = ntohs(iph->tot_len); return (len || !skb_is_gso(skb) || !skb_is_gso_tcp(skb)) ? len : skb->len - skb_network_offset(skb); } static inline unsigned int skb_ip_totlen(const struct sk_buff *skb) { return iph_totlen(skb, ip_hdr(skb)); } /* IPv4 datagram length is stored into 16bit field (tot_len) */ #define IP_MAX_MTU 0xFFFFU static inline void iph_set_totlen(struct iphdr *iph, unsigned int len) { iph->tot_len = len <= IP_MAX_MTU ? htons(len) : 0; } #endif /* _LINUX_IP_H */
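A brief illustration of how the accessors above are meant to be used (hypothetical function, not part of this header): given an skb whose network header is already set, read the IPv4 header and the GSO-safe total length.

#include <linux/ip.h>
#include <linux/printk.h>

/* Hypothetical receive-path helper. */
static unsigned int demo_log_ipv4(const struct sk_buff *skb)
{
	const struct iphdr *iph = ip_hdr(skb);
	/* skb_ip_totlen() falls back to skb->len for GSO TCP with tot_len == 0 */
	unsigned int len = skb_ip_totlen(skb);

	pr_debug("IPv4 %pI4 -> %pI4, total length %u\n",
		 &iph->saddr, &iph->daddr, len);
	return len;
}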
// SPDX-License-Identifier: GPL-2.0-only /* * linux/fs/pnode.c * * (C) Copyright IBM Corporation 2005. 
* Author : Ram Pai (linuxram@us.ibm.com) */ #include <linux/mnt_namespace.h> #include <linux/mount.h> #include <linux/fs.h> #include <linux/nsproxy.h> #include <uapi/linux/mount.h> #include "internal.h" #include "pnode.h" /* return the next shared peer mount of @p */ static inline struct mount *next_peer(struct mount *p) { return list_entry(p->mnt_share.next, struct mount, mnt_share); } static inline struct mount *first_slave(struct mount *p) { return hlist_entry(p->mnt_slave_list.first, struct mount, mnt_slave); } static inline struct mount *next_slave(struct mount *p) { return hlist_entry(p->mnt_slave.next, struct mount, mnt_slave); } /* locks: namespace_shared && is_mounted(mnt) */ static struct mount *get_peer_under_root(struct mount *mnt, struct mnt_namespace *ns, const struct path *root) { struct mount *m = mnt; do { /* Check the namespace first for optimization */ if (m->mnt_ns == ns && is_path_reachable(m, m->mnt.mnt_root, root)) return m; m = next_peer(m); } while (m != mnt); return NULL; } /* * Get ID of closest dominating peer group having a representative * under the given root. * * locks: namespace_shared */ int get_dominating_id(struct mount *mnt, const struct path *root) { struct mount *m; for (m = mnt->mnt_master; m != NULL; m = m->mnt_master) { struct mount *d = get_peer_under_root(m, mnt->mnt_ns, root); if (d) return d->mnt_group_id; } return 0; } static inline bool will_be_unmounted(struct mount *m) { return m->mnt.mnt_flags & MNT_UMOUNT; } static void transfer_propagation(struct mount *mnt, struct mount *to) { struct hlist_node *p = NULL, *n; struct mount *m; hlist_for_each_entry_safe(m, n, &mnt->mnt_slave_list, mnt_slave) { m->mnt_master = to; if (!to) hlist_del_init(&m->mnt_slave); else p = &m->mnt_slave; } if (p) hlist_splice_init(&mnt->mnt_slave_list, p, &to->mnt_slave_list); } /* * EXCL[namespace_sem] */ void change_mnt_propagation(struct mount *mnt, int type) { struct mount *m = mnt->mnt_master; if (type == MS_SHARED) { set_mnt_shared(mnt); return; } if (IS_MNT_SHARED(mnt)) { if (list_empty(&mnt->mnt_share)) { mnt_release_group_id(mnt); } else { m = next_peer(mnt); list_del_init(&mnt->mnt_share); mnt->mnt_group_id = 0; } CLEAR_MNT_SHARED(mnt); transfer_propagation(mnt, m); } hlist_del_init(&mnt->mnt_slave); if (type == MS_SLAVE) { mnt->mnt_master = m; if (m) hlist_add_head(&mnt->mnt_slave, &m->mnt_slave_list); } else { mnt->mnt_master = NULL; if (type == MS_UNBINDABLE) mnt->mnt_t_flags |= T_UNBINDABLE; else mnt->mnt_t_flags &= ~T_UNBINDABLE; } } static struct mount *trace_transfers(struct mount *m) { while (1) { struct mount *next = next_peer(m); if (next != m) { list_del_init(&m->mnt_share); m->mnt_group_id = 0; m->mnt_master = next; } else { if (IS_MNT_SHARED(m)) mnt_release_group_id(m); next = m->mnt_master; } hlist_del_init(&m->mnt_slave); CLEAR_MNT_SHARED(m); SET_MNT_MARK(m); if (!next || !will_be_unmounted(next)) return next; if (IS_MNT_MARKED(next)) return next->mnt_master; m = next; } } static void set_destinations(struct mount *m, struct mount *master) { struct mount *next; while ((next = m->mnt_master) != master) { m->mnt_master = master; m = next; } } void bulk_make_private(struct list_head *set) { struct mount *m; list_for_each_entry(m, set, mnt_list) if (!IS_MNT_MARKED(m)) set_destinations(m, trace_transfers(m)); list_for_each_entry(m, set, mnt_list) { transfer_propagation(m, m->mnt_master); m->mnt_master = NULL; CLEAR_MNT_MARK(m); } } static struct mount *__propagation_next(struct mount *m, struct mount *origin) { while (1) { struct mount *master = 
m->mnt_master; if (master == origin->mnt_master) { struct mount *next = next_peer(m); return (next == origin) ? NULL : next; } else if (m->mnt_slave.next) return next_slave(m); /* back at master */ m = master; } } /* * get the next mount in the propagation tree. * @m: the mount seen last * @origin: the original mount from where the tree walk initiated * * Note that peer groups form contiguous segments of slave lists. * We rely on that in get_source() to be able to find out if * vfsmount found while iterating with propagation_next() is * a peer of one we'd found earlier. */ static struct mount *propagation_next(struct mount *m, struct mount *origin) { /* are there any slaves of this mount? */ if (!IS_MNT_NEW(m) && !hlist_empty(&m->mnt_slave_list)) return first_slave(m); return __propagation_next(m, origin); } static struct mount *skip_propagation_subtree(struct mount *m, struct mount *origin) { /* * Advance m past everything that gets propagation from it. */ struct mount *p = __propagation_next(m, origin); while (p && peers(m, p)) p = __propagation_next(p, origin); return p; } static struct mount *next_group(struct mount *m, struct mount *origin) { while (1) { while (1) { struct mount *next; if (!IS_MNT_NEW(m) && !hlist_empty(&m->mnt_slave_list)) return first_slave(m); next = next_peer(m); if (m->mnt_group_id == origin->mnt_group_id) { if (next == origin) return NULL; } else if (m->mnt_slave.next != &next->mnt_slave) break; m = next; } /* m is the last peer */ while (1) { struct mount *master = m->mnt_master; if (m->mnt_slave.next) return next_slave(m); m = next_peer(master); if (master->mnt_group_id == origin->mnt_group_id) break; if (master->mnt_slave.next == &m->mnt_slave) break; m = master; } if (m == origin) return NULL; } } static bool need_secondary(struct mount *m, struct mountpoint *dest_mp) { /* skip ones added by this propagate_mnt() */ if (IS_MNT_NEW(m)) return false; /* skip if mountpoint isn't visible in m */ if (!is_subdir(dest_mp->m_dentry, m->mnt.mnt_root)) return false; /* skip if m is in the anon_ns */ if (is_anon_ns(m->mnt_ns)) return false; return true; } static struct mount *find_master(struct mount *m, struct mount *last_copy, struct mount *original) { struct mount *p; // ascend until there's a copy for something with the same master for (;;) { p = m->mnt_master; if (!p || IS_MNT_MARKED(p)) break; m = p; } while (!peers(last_copy, original)) { struct mount *parent = last_copy->mnt_parent; if (parent->mnt_master == p) { if (!peers(parent, m)) last_copy = last_copy->mnt_master; break; } last_copy = last_copy->mnt_master; } return last_copy; } /** * propagate_mnt() - create secondary copies for tree attachment * @dest_mnt: destination mount. * @dest_mp: destination mountpoint. * @source_mnt: source mount. * @tree_list: list of secondaries to be attached. * * Create secondary copies for attaching a tree with root @source_mnt * at mount @dest_mnt with mountpoint @dest_mp. Link all new mounts * into a propagation graph. Set mountpoints for all secondaries, * link their roots into @tree_list via ->mnt_hash. 
*/ int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp, struct mount *source_mnt, struct hlist_head *tree_list) { struct mount *m, *n, *copy, *this; int err = 0, type; if (dest_mnt->mnt_master) SET_MNT_MARK(dest_mnt->mnt_master); /* iterate over peer groups, depth first */ for (m = dest_mnt; m && !err; m = next_group(m, dest_mnt)) { if (m == dest_mnt) { // have one for dest_mnt itself copy = source_mnt; type = CL_MAKE_SHARED; n = next_peer(m); if (n == m) continue; } else { type = CL_SLAVE; /* beginning of peer group among the slaves? */ if (IS_MNT_SHARED(m)) type |= CL_MAKE_SHARED; n = m; } do { if (!need_secondary(n, dest_mp)) continue; if (type & CL_SLAVE) // first in this peer group copy = find_master(n, copy, source_mnt); this = copy_tree(copy, copy->mnt.mnt_root, type); if (IS_ERR(this)) { err = PTR_ERR(this); break; } scoped_guard(mount_locked_reader) mnt_set_mountpoint(n, dest_mp, this); if (n->mnt_master) SET_MNT_MARK(n->mnt_master); copy = this; hlist_add_head(&this->mnt_hash, tree_list); err = count_mounts(n->mnt_ns, this); if (err) break; type = CL_MAKE_SHARED; } while ((n = next_peer(n)) != m); } hlist_for_each_entry(n, tree_list, mnt_hash) { m = n->mnt_parent; if (m->mnt_master) CLEAR_MNT_MARK(m->mnt_master); } if (dest_mnt->mnt_master) CLEAR_MNT_MARK(dest_mnt->mnt_master); return err; } /* * return true if the refcount is greater than count */ static inline int do_refcount_check(struct mount *mnt, int count) { return mnt_get_count(mnt) > count; } /** * propagation_would_overmount - check whether propagation from @from * would overmount @to * @from: shared mount * @to: mount to check * @mp: future mountpoint of @to on @from * * If @from propagates mounts to @to, @from and @to must either be peers * or one of the masters in the hierarchy of masters of @to must be a * peer of @from. * * If the root of the @to mount is equal to the future mountpoint @mp of * the @to mount on @from then @to will be overmounted by whatever is * propagated to it. * * Context: This function expects namespace_lock() to be held and that * @mp is stable. * Return: If @from overmounts @to, true is returned, false if not. */ bool propagation_would_overmount(const struct mount *from, const struct mount *to, const struct mountpoint *mp) { if (!IS_MNT_SHARED(from)) return false; if (to->mnt.mnt_root != mp->m_dentry) return false; for (const struct mount *m = to; m; m = m->mnt_master) { if (peers(from, m)) return true; } return false; } /* * check if the mount 'mnt' can be unmounted successfully. * @mnt: the mount to be checked for unmount * NOTE: unmounting 'mnt' would naturally propagate to all * other mounts its parent propagates to. * Check if any of these mounts that **do not have submounts** * have more references than 'refcnt'. If so return busy. * * vfsmount lock must be held for write */ int propagate_mount_busy(struct mount *mnt, int refcnt) { struct mount *parent = mnt->mnt_parent; /* * quickly check if the current mount can be unmounted. * If not, we don't have to go checking for all other * mounts */ if (!list_empty(&mnt->mnt_mounts) || do_refcount_check(mnt, refcnt)) return 1; if (mnt == parent) return 0; for (struct mount *m = propagation_next(parent, parent); m; m = propagation_next(m, parent)) { struct list_head *head; struct mount *child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint); if (!child) continue; head = &child->mnt_mounts; if (!list_empty(head)) { /* * a mount that covers child completely wouldn't prevent * it being pulled out; any other would. 
*/ if (!list_is_singular(head) || !child->overmount) continue; } if (do_refcount_check(child, 1)) return 1; } return 0; } /* * Clear MNT_LOCKED when it can be shown to be safe. * * mount_lock lock must be held for write */ void propagate_mount_unlock(struct mount *mnt) { struct mount *parent = mnt->mnt_parent; struct mount *m, *child; BUG_ON(parent == mnt); for (m = propagation_next(parent, parent); m; m = propagation_next(m, parent)) { child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint); if (child) child->mnt.mnt_flags &= ~MNT_LOCKED; } } static inline bool is_candidate(struct mount *m) { return m->mnt_t_flags & T_UMOUNT_CANDIDATE; } static void umount_one(struct mount *m, struct list_head *to_umount) { m->mnt.mnt_flags |= MNT_UMOUNT; list_del_init(&m->mnt_child); move_from_ns(m); list_add_tail(&m->mnt_list, to_umount); } static void remove_from_candidate_list(struct mount *m) { m->mnt_t_flags &= ~(T_MARKED | T_UMOUNT_CANDIDATE); list_del_init(&m->mnt_list); } static void gather_candidates(struct list_head *set, struct list_head *candidates) { struct mount *m, *p, *q; list_for_each_entry(m, set, mnt_list) { if (is_candidate(m)) continue; m->mnt_t_flags |= T_UMOUNT_CANDIDATE; p = m->mnt_parent; q = propagation_next(p, p); while (q) { struct mount *child = __lookup_mnt(&q->mnt, m->mnt_mountpoint); if (child) { /* * We might've already run into this one. That * must've happened on earlier iteration of the * outer loop; in that case we can skip those * parents that get propagation from q - there * will be nothing new on those as well. */ if (is_candidate(child)) { q = skip_propagation_subtree(q, p); continue; } child->mnt_t_flags |= T_UMOUNT_CANDIDATE; if (!will_be_unmounted(child)) list_add(&child->mnt_list, candidates); } q = propagation_next(q, p); } } list_for_each_entry(m, set, mnt_list) m->mnt_t_flags &= ~T_UMOUNT_CANDIDATE; } /* * We know that some child of @m can't be unmounted. In all places where the * chain of descent of @m has child not overmounting the root of parent, * the parent can't be unmounted either. */ static void trim_ancestors(struct mount *m) { struct mount *p; for (p = m->mnt_parent; is_candidate(p); m = p, p = p->mnt_parent) { if (IS_MNT_MARKED(m)) // all candidates beneath are overmounts return; SET_MNT_MARK(m); if (m != p->overmount) p->mnt_t_flags &= ~T_UMOUNT_CANDIDATE; } } /* * Find and exclude all umount candidates forbidden by @m * (see Documentation/filesystems/propagate_umount.txt) * If we can immediately tell that @m is OK to unmount (unlocked * and all children are already committed to unmounting) commit * to unmounting it. * Only @m itself might be taken from the candidates list; * anything found by trim_ancestors() is marked non-candidate * and left on the list. 
*/ static void trim_one(struct mount *m, struct list_head *to_umount) { bool remove_this = false, found = false, umount_this = false; struct mount *n; if (!is_candidate(m)) { // trim_ancestors() left it on list remove_from_candidate_list(m); return; } list_for_each_entry(n, &m->mnt_mounts, mnt_child) { if (!is_candidate(n)) { found = true; if (n != m->overmount) { remove_this = true; break; } } } if (found) { trim_ancestors(m); } else if (!IS_MNT_LOCKED(m) && list_empty(&m->mnt_mounts)) { remove_this = true; umount_this = true; } if (remove_this) { remove_from_candidate_list(m); if (umount_this) umount_one(m, to_umount); } } static void handle_locked(struct mount *m, struct list_head *to_umount) { struct mount *cutoff = m, *p; if (!is_candidate(m)) { // trim_ancestors() left it on list remove_from_candidate_list(m); return; } for (p = m; is_candidate(p); p = p->mnt_parent) { remove_from_candidate_list(p); if (!IS_MNT_LOCKED(p)) cutoff = p->mnt_parent; } if (will_be_unmounted(p)) cutoff = p; while (m != cutoff) { umount_one(m, to_umount); m = m->mnt_parent; } } /* * @m is not to going away, and it overmounts the top of a stack of mounts * that are going away. We know that all of those are fully overmounted * by the one above (@m being the topmost of the chain), so @m can be slid * in place where the bottom of the stack is attached. * * NOTE: here we temporarily violate a constraint - two mounts end up with * the same parent and mountpoint; that will be remedied as soon as we * return from propagate_umount() - its caller (umount_tree()) will detach * the stack from the parent it (and now @m) is attached to. umount_tree() * might choose to keep unmounted pieces stuck to each other, but it always * detaches them from the mounts that remain in the tree. */ static void reparent(struct mount *m) { struct mount *p = m; struct mountpoint *mp; do { mp = p->mnt_mp; p = p->mnt_parent; } while (will_be_unmounted(p)); mnt_change_mountpoint(p, mp, m); mnt_notify_add(m); } /** * propagate_umount - apply propagation rules to the set of mounts for umount() * @set: the list of mounts to be unmounted. * * Collect all mounts that receive propagation from the mount in @set and have * no obstacles to being unmounted. Add these additional mounts to the set. * * See Documentation/filesystems/propagate_umount.txt if you do anything in * this area. * * Locks held: * mount_lock (write_seqlock), namespace_sem (exclusive). */ void propagate_umount(struct list_head *set) { struct mount *m, *p; LIST_HEAD(to_umount); // committed to unmounting LIST_HEAD(candidates); // undecided umount candidates // collect all candidates gather_candidates(set, &candidates); // reduce the set until it's non-shifting list_for_each_entry_safe(m, p, &candidates, mnt_list) trim_one(m, &to_umount); // ... and non-revealing while (!list_empty(&candidates)) { m = list_first_entry(&candidates,struct mount, mnt_list); handle_locked(m, &to_umount); } // now to_umount consists of all acceptable candidates // deal with reparenting of surviving overmounts on those list_for_each_entry(m, &to_umount, mnt_list) { struct mount *over = m->overmount; if (over && !will_be_unmounted(over)) reparent(over); } // and fold them into the set list_splice_tail_init(&to_umount, set); }
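The propagation types handled by change_mnt_propagation() above map directly onto mount(2) flags; the following userspace sketch (illustrative paths, reduced error handling) shows the operations that ultimately drive this code.

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* make /mnt shared: mount events under it propagate to its peers */
	if (mount(NULL, "/mnt", NULL, MS_SHARED, NULL) == -1)
		perror("MS_SHARED");

	/* downgrade to slave: it receives propagation but no longer sends it */
	if (mount(NULL, "/mnt", NULL, MS_SLAVE, NULL) == -1)
		perror("MS_SLAVE");

	/* make the whole subtree private and unbindable */
	if (mount(NULL, "/mnt", NULL, MS_REC | MS_UNBINDABLE, NULL) == -1)
		perror("MS_UNBINDABLE");

	return 0;
}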
/* netfilter.c: look after the filters for various protocols. * Heavily influenced by the old firewall.c by David Bonn and Alan Cox. * * Thanks to Rob `CmdrTaco' Malda for not influencing this code in any * way. * * This code is GPL. */ #include <linux/kernel.h> #include <linux/netfilter.h> #include <net/protocol.h> #include <linux/init.h> #include <linux/skbuff.h> #include <linux/wait.h> #include <linux/module.h> #include <linux/interrupt.h> #include <linux/if.h> #include <linux/netdevice.h> #include <linux/netfilter_ipv6.h> #include <linux/inetdevice.h> #include <linux/proc_fs.h> #include <linux/mutex.h> #include <linux/mm.h> #include <linux/rcupdate.h> #include <net/net_namespace.h> #include <net/netfilter/nf_queue.h> #include <net/sock.h> #include "nf_internals.h" const struct nf_ipv6_ops __rcu *nf_ipv6_ops __read_mostly; EXPORT_SYMBOL_GPL(nf_ipv6_ops); #ifdef CONFIG_JUMP_LABEL struct static_key nf_hooks_needed[NFPROTO_NUMPROTO][NF_MAX_HOOKS]; EXPORT_SYMBOL(nf_hooks_needed); #endif static DEFINE_MUTEX(nf_hook_mutex); /* max hooks per family/hooknum */ #define MAX_HOOK_COUNT 1024 #define nf_entry_dereference(e) \ rcu_dereference_protected(e, lockdep_is_held(&nf_hook_mutex)) static struct nf_hook_entries *allocate_hook_entries_size(u16 num) { struct nf_hook_entries *e; size_t alloc = sizeof(*e) + sizeof(struct nf_hook_entry) * num + sizeof(struct nf_hook_ops *) * num + sizeof(struct nf_hook_entries_rcu_head); if (num == 0) return NULL; e = kvzalloc(alloc, GFP_KERNEL_ACCOUNT); if (e) e->num_hook_entries = num; return e; } static void __nf_hook_entries_free(struct rcu_head *h) { struct nf_hook_entries_rcu_head *head; head = container_of(h, struct nf_hook_entries_rcu_head, head); kvfree(head->allocation); } static void nf_hook_entries_free(struct nf_hook_entries *e) { struct nf_hook_entries_rcu_head *head; struct nf_hook_ops **ops; unsigned int num; if (!e) return; num = e->num_hook_entries; ops = nf_hook_entries_get_hook_ops(e); head = (void *)&ops[num]; head->allocation = e; call_rcu(&head->head, __nf_hook_entries_free); } static unsigned int accept_all(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { return NF_ACCEPT; /* ACCEPT makes nf_hook_slow call next hook */ } static const struct nf_hook_ops dummy_ops = { .hook = accept_all, .priority = INT_MIN, }; static struct nf_hook_entries * nf_hook_entries_grow(const struct nf_hook_entries *old, const struct nf_hook_ops *reg) { unsigned int i, alloc_entries, nhooks, old_entries; struct nf_hook_ops **orig_ops = NULL; struct nf_hook_ops **new_ops; struct nf_hook_entries *new; bool inserted = false; alloc_entries = 1; old_entries = old ? old->num_hook_entries : 0; if (old) { orig_ops = nf_hook_entries_get_hook_ops(old); for (i = 0; i < old_entries; i++) { if (orig_ops[i] != &dummy_ops) alloc_entries++; /* Restrict BPF hook type to force a unique priority, not * shared at attach time. * * This is mainly to avoid ordering issues between two * different bpf programs, this doesn't prevent a normal * hook at same priority as a bpf one (we don't want to * prevent defrag, conntrack, iptables etc from attaching). 
*/ if (reg->priority == orig_ops[i]->priority && reg->hook_ops_type == NF_HOOK_OP_BPF) return ERR_PTR(-EBUSY); } } if (alloc_entries > MAX_HOOK_COUNT) return ERR_PTR(-E2BIG); new = allocate_hook_entries_size(alloc_entries); if (!new) return ERR_PTR(-ENOMEM); new_ops = nf_hook_entries_get_hook_ops(new); i = 0; nhooks = 0; while (i < old_entries) { if (orig_ops[i] == &dummy_ops) { ++i; continue; } if (inserted || reg->priority > orig_ops[i]->priority) { new_ops[nhooks] = (void *)orig_ops[i]; new->hooks[nhooks] = old->hooks[i]; i++; } else { new_ops[nhooks] = (void *)reg; new->hooks[nhooks].hook = reg->hook; new->hooks[nhooks].priv = reg->priv; inserted = true; } nhooks++; } if (!inserted) { new_ops[nhooks] = (void *)reg; new->hooks[nhooks].hook = reg->hook; new->hooks[nhooks].priv = reg->priv; } return new; } static void hooks_validate(const struct nf_hook_entries *hooks) { #ifdef CONFIG_DEBUG_MISC struct nf_hook_ops **orig_ops; int prio = INT_MIN; size_t i = 0; orig_ops = nf_hook_entries_get_hook_ops(hooks); for (i = 0; i < hooks->num_hook_entries; i++) { if (orig_ops[i] == &dummy_ops) continue; WARN_ON(orig_ops[i]->priority < prio); if (orig_ops[i]->priority > prio) prio = orig_ops[i]->priority; } #endif } int nf_hook_entries_insert_raw(struct nf_hook_entries __rcu **pp, const struct nf_hook_ops *reg) { struct nf_hook_entries *new_hooks; struct nf_hook_entries *p; p = rcu_dereference_raw(*pp); new_hooks = nf_hook_entries_grow(p, reg); if (IS_ERR(new_hooks)) return PTR_ERR(new_hooks); hooks_validate(new_hooks); rcu_assign_pointer(*pp, new_hooks); BUG_ON(p == new_hooks); nf_hook_entries_free(p); return 0; } EXPORT_SYMBOL_GPL(nf_hook_entries_insert_raw); /* * __nf_hook_entries_try_shrink - try to shrink hook array * * @old -- current hook blob at @pp * @pp -- location of hook blob * * Hook unregistration must always succeed, so to-be-removed hooks * are replaced by a dummy one that will just move to next hook. * * This counts the current dummy hooks, attempts to allocate new blob, * copies the live hooks, then replaces and discards old one. * * return values: * * Returns address to free, or NULL. 
*/ static void *__nf_hook_entries_try_shrink(struct nf_hook_entries *old, struct nf_hook_entries __rcu **pp) { unsigned int i, j, skip = 0, hook_entries; struct nf_hook_entries *new = NULL; struct nf_hook_ops **orig_ops; struct nf_hook_ops **new_ops; if (WARN_ON_ONCE(!old)) return NULL; orig_ops = nf_hook_entries_get_hook_ops(old); for (i = 0; i < old->num_hook_entries; i++) { if (orig_ops[i] == &dummy_ops) skip++; } /* if skip == hook_entries all hooks have been removed */ hook_entries = old->num_hook_entries; if (skip == hook_entries) goto out_assign; if (skip == 0) return NULL; hook_entries -= skip; new = allocate_hook_entries_size(hook_entries); if (!new) return NULL; new_ops = nf_hook_entries_get_hook_ops(new); for (i = 0, j = 0; i < old->num_hook_entries; i++) { if (orig_ops[i] == &dummy_ops) continue; new->hooks[j] = old->hooks[i]; new_ops[j] = (void *)orig_ops[i]; j++; } hooks_validate(new); out_assign: rcu_assign_pointer(*pp, new); return old; } static struct nf_hook_entries __rcu ** nf_hook_entry_head(struct net *net, int pf, unsigned int hooknum, struct net_device *dev) { switch (pf) { case NFPROTO_NETDEV: break; #ifdef CONFIG_NETFILTER_FAMILY_ARP case NFPROTO_ARP: if (WARN_ON_ONCE(ARRAY_SIZE(net->nf.hooks_arp) <= hooknum)) return NULL; return net->nf.hooks_arp + hooknum; #endif #ifdef CONFIG_NETFILTER_FAMILY_BRIDGE case NFPROTO_BRIDGE: if (WARN_ON_ONCE(ARRAY_SIZE(net->nf.hooks_bridge) <= hooknum)) return NULL; return net->nf.hooks_bridge + hooknum; #endif #ifdef CONFIG_NETFILTER_INGRESS case NFPROTO_INET: if (WARN_ON_ONCE(hooknum != NF_INET_INGRESS)) return NULL; if (!dev || dev_net(dev) != net) { WARN_ON_ONCE(1); return NULL; } return &dev->nf_hooks_ingress; #endif case NFPROTO_IPV4: if (WARN_ON_ONCE(ARRAY_SIZE(net->nf.hooks_ipv4) <= hooknum)) return NULL; return net->nf.hooks_ipv4 + hooknum; case NFPROTO_IPV6: if (WARN_ON_ONCE(ARRAY_SIZE(net->nf.hooks_ipv6) <= hooknum)) return NULL; return net->nf.hooks_ipv6 + hooknum; default: WARN_ON_ONCE(1); return NULL; } #ifdef CONFIG_NETFILTER_INGRESS if (hooknum == NF_NETDEV_INGRESS) { if (dev && dev_net(dev) == net) return &dev->nf_hooks_ingress; } #endif #ifdef CONFIG_NETFILTER_EGRESS if (hooknum == NF_NETDEV_EGRESS) { if (dev && dev_net(dev) == net) return &dev->nf_hooks_egress; } #endif WARN_ON_ONCE(1); return NULL; } static int nf_ingress_check(struct net *net, const struct nf_hook_ops *reg, int hooknum) { #ifndef CONFIG_NETFILTER_INGRESS if (reg->hooknum == hooknum) return -EOPNOTSUPP; #endif if (reg->hooknum != hooknum || !reg->dev || dev_net(reg->dev) != net) return -EINVAL; return 0; } static inline bool __maybe_unused nf_ingress_hook(const struct nf_hook_ops *reg, int pf) { if ((pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS) || (pf == NFPROTO_INET && reg->hooknum == NF_INET_INGRESS)) return true; return false; } static inline bool __maybe_unused nf_egress_hook(const struct nf_hook_ops *reg, int pf) { return pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_EGRESS; } static void nf_static_key_inc(const struct nf_hook_ops *reg, int pf) { #ifdef CONFIG_JUMP_LABEL int hooknum; if (pf == NFPROTO_INET && reg->hooknum == NF_INET_INGRESS) { pf = NFPROTO_NETDEV; hooknum = NF_NETDEV_INGRESS; } else { hooknum = reg->hooknum; } static_key_slow_inc(&nf_hooks_needed[pf][hooknum]); #endif } static void nf_static_key_dec(const struct nf_hook_ops *reg, int pf) { #ifdef CONFIG_JUMP_LABEL int hooknum; if (pf == NFPROTO_INET && reg->hooknum == NF_INET_INGRESS) { pf = NFPROTO_NETDEV; hooknum = NF_NETDEV_INGRESS; } else { hooknum = 
reg->hooknum; } static_key_slow_dec(&nf_hooks_needed[pf][hooknum]); #endif } static int __nf_register_net_hook(struct net *net, int pf, const struct nf_hook_ops *reg) { struct nf_hook_entries *p, *new_hooks; struct nf_hook_entries __rcu **pp; int err; switch (pf) { case NFPROTO_NETDEV: #ifndef CONFIG_NETFILTER_INGRESS if (reg->hooknum == NF_NETDEV_INGRESS) return -EOPNOTSUPP; #endif #ifndef CONFIG_NETFILTER_EGRESS if (reg->hooknum == NF_NETDEV_EGRESS) return -EOPNOTSUPP; #endif if ((reg->hooknum != NF_NETDEV_INGRESS && reg->hooknum != NF_NETDEV_EGRESS) || !reg->dev || dev_net(reg->dev) != net) return -EINVAL; break; case NFPROTO_INET: if (reg->hooknum != NF_INET_INGRESS) break; err = nf_ingress_check(net, reg, NF_INET_INGRESS); if (err < 0) return err; break; } pp = nf_hook_entry_head(net, pf, reg->hooknum, reg->dev); if (!pp) return -EINVAL; mutex_lock(&nf_hook_mutex); p = nf_entry_dereference(*pp); new_hooks = nf_hook_entries_grow(p, reg); if (!IS_ERR(new_hooks)) { hooks_validate(new_hooks); rcu_assign_pointer(*pp, new_hooks); } mutex_unlock(&nf_hook_mutex); if (IS_ERR(new_hooks)) return PTR_ERR(new_hooks); #ifdef CONFIG_NETFILTER_INGRESS if (nf_ingress_hook(reg, pf)) net_inc_ingress_queue(); #endif #ifdef CONFIG_NETFILTER_EGRESS if (nf_egress_hook(reg, pf)) net_inc_egress_queue(); #endif nf_static_key_inc(reg, pf); BUG_ON(p == new_hooks); nf_hook_entries_free(p); return 0; } /* * nf_remove_net_hook - remove a hook from blob * * @oldp: current address of hook blob * @unreg: hook to unregister * * This cannot fail, hook unregistration must always succeed. * Therefore replace the to-be-removed hook with a dummy hook. */ static bool nf_remove_net_hook(struct nf_hook_entries *old, const struct nf_hook_ops *unreg) { struct nf_hook_ops **orig_ops; unsigned int i; orig_ops = nf_hook_entries_get_hook_ops(old); for (i = 0; i < old->num_hook_entries; i++) { if (orig_ops[i] != unreg) continue; WRITE_ONCE(old->hooks[i].hook, accept_all); WRITE_ONCE(orig_ops[i], (void *)&dummy_ops); return true; } return false; } static void __nf_unregister_net_hook(struct net *net, int pf, const struct nf_hook_ops *reg) { struct nf_hook_entries __rcu **pp; struct nf_hook_entries *p; pp = nf_hook_entry_head(net, pf, reg->hooknum, reg->dev); if (!pp) return; mutex_lock(&nf_hook_mutex); p = nf_entry_dereference(*pp); if (WARN_ON_ONCE(!p)) { mutex_unlock(&nf_hook_mutex); return; } if (nf_remove_net_hook(p, reg)) { #ifdef CONFIG_NETFILTER_INGRESS if (nf_ingress_hook(reg, pf)) net_dec_ingress_queue(); #endif #ifdef CONFIG_NETFILTER_EGRESS if (nf_egress_hook(reg, pf)) net_dec_egress_queue(); #endif nf_static_key_dec(reg, pf); } else { WARN_ONCE(1, "hook not found, pf %d num %d", pf, reg->hooknum); } p = __nf_hook_entries_try_shrink(p, pp); mutex_unlock(&nf_hook_mutex); if (!p) return; nf_queue_nf_hook_drop(net); nf_hook_entries_free(p); } void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg) { if (reg->pf == NFPROTO_INET) { if (reg->hooknum == NF_INET_INGRESS) { __nf_unregister_net_hook(net, NFPROTO_INET, reg); } else { __nf_unregister_net_hook(net, NFPROTO_IPV4, reg); __nf_unregister_net_hook(net, NFPROTO_IPV6, reg); } } else { __nf_unregister_net_hook(net, reg->pf, reg); } } EXPORT_SYMBOL(nf_unregister_net_hook); void nf_hook_entries_delete_raw(struct nf_hook_entries __rcu **pp, const struct nf_hook_ops *reg) { struct nf_hook_entries *p; p = rcu_dereference_raw(*pp); if (nf_remove_net_hook(p, reg)) { p = __nf_hook_entries_try_shrink(p, pp); nf_hook_entries_free(p); } } 
EXPORT_SYMBOL_GPL(nf_hook_entries_delete_raw); int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg) { int err; if (reg->pf == NFPROTO_INET) { if (reg->hooknum == NF_INET_INGRESS) { err = __nf_register_net_hook(net, NFPROTO_INET, reg); if (err < 0) return err; } else { err = __nf_register_net_hook(net, NFPROTO_IPV4, reg); if (err < 0) return err; err = __nf_register_net_hook(net, NFPROTO_IPV6, reg); if (err < 0) { __nf_unregister_net_hook(net, NFPROTO_IPV4, reg); return err; } } } else { err = __nf_register_net_hook(net, reg->pf, reg); if (err < 0) return err; } return 0; } EXPORT_SYMBOL(nf_register_net_hook); int nf_register_net_hooks(struct net *net, const struct nf_hook_ops *reg, unsigned int n) { unsigned int i; int err = 0; for (i = 0; i < n; i++) { err = nf_register_net_hook(net, &reg[i]); if (err) goto err; } return err; err: if (i > 0) nf_unregister_net_hooks(net, reg, i); return err; } EXPORT_SYMBOL(nf_register_net_hooks); void nf_unregister_net_hooks(struct net *net, const struct nf_hook_ops *reg, unsigned int hookcount) { unsigned int i; for (i = 0; i < hookcount; i++) nf_unregister_net_hook(net, &reg[i]); } EXPORT_SYMBOL(nf_unregister_net_hooks); /* Returns 1 if okfn() needs to be executed by the caller, * -EPERM for NF_DROP, 0 otherwise. Caller must hold rcu_read_lock. */ int nf_hook_slow(struct sk_buff *skb, struct nf_hook_state *state, const struct nf_hook_entries *e, unsigned int s) { unsigned int verdict; int ret; for (; s < e->num_hook_entries; s++) { verdict = nf_hook_entry_hookfn(&e->hooks[s], skb, state); switch (verdict & NF_VERDICT_MASK) { case NF_ACCEPT: break; case NF_DROP: kfree_skb_reason(skb, SKB_DROP_REASON_NETFILTER_DROP); ret = NF_DROP_GETERR(verdict); if (ret == 0) ret = -EPERM; return ret; case NF_QUEUE: ret = nf_queue(skb, state, s, verdict); if (ret == 1) continue; return ret; case NF_STOLEN: return NF_DROP_GETERR(verdict); default: WARN_ON_ONCE(1); return 0; } } return 1; } EXPORT_SYMBOL(nf_hook_slow); void nf_hook_slow_list(struct list_head *head, struct nf_hook_state *state, const struct nf_hook_entries *e) { struct sk_buff *skb, *next; LIST_HEAD(sublist); int ret; list_for_each_entry_safe(skb, next, head, list) { skb_list_del_init(skb); ret = nf_hook_slow(skb, state, e, 0); if (ret == 1) list_add_tail(&skb->list, &sublist); } /* Put passed packets back on main list */ list_splice(&sublist, head); } EXPORT_SYMBOL(nf_hook_slow_list); /* This needs to be compiled in any case to avoid dependencies between the * nfnetlink_queue code and nf_conntrack. */ const struct nfnl_ct_hook __rcu *nfnl_ct_hook __read_mostly; EXPORT_SYMBOL_GPL(nfnl_ct_hook); const struct nf_ct_hook __rcu *nf_ct_hook __read_mostly; EXPORT_SYMBOL_GPL(nf_ct_hook); const struct nf_defrag_hook __rcu *nf_defrag_v4_hook __read_mostly; EXPORT_SYMBOL_GPL(nf_defrag_v4_hook); const struct nf_defrag_hook __rcu *nf_defrag_v6_hook __read_mostly; EXPORT_SYMBOL_GPL(nf_defrag_v6_hook); #if IS_ENABLED(CONFIG_NF_CONNTRACK) u8 nf_ctnetlink_has_listener; EXPORT_SYMBOL_GPL(nf_ctnetlink_has_listener); const struct nf_nat_hook __rcu *nf_nat_hook __read_mostly; EXPORT_SYMBOL_GPL(nf_nat_hook); /* This does not belong here, but locally generated errors need it if connection * tracking in use: without this, connection may not be in hash table, and hence * manufactured ICMP or RST packets will not be associated with it. 
*/ void nf_ct_attach(struct sk_buff *new, const struct sk_buff *skb) { const struct nf_ct_hook *ct_hook; if (skb->_nfct) { rcu_read_lock(); ct_hook = rcu_dereference(nf_ct_hook); if (ct_hook) ct_hook->attach(new, skb); rcu_read_unlock(); } } EXPORT_SYMBOL(nf_ct_attach); void nf_conntrack_destroy(struct nf_conntrack *nfct) { const struct nf_ct_hook *ct_hook; rcu_read_lock(); ct_hook = rcu_dereference(nf_ct_hook); if (ct_hook) ct_hook->destroy(nfct); rcu_read_unlock(); WARN_ON(!ct_hook); } EXPORT_SYMBOL(nf_conntrack_destroy); void nf_ct_set_closing(struct nf_conntrack *nfct) { const struct nf_ct_hook *ct_hook; if (!nfct) return; rcu_read_lock(); ct_hook = rcu_dereference(nf_ct_hook); if (ct_hook) ct_hook->set_closing(nfct); rcu_read_unlock(); } EXPORT_SYMBOL_GPL(nf_ct_set_closing); bool nf_ct_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple, const struct sk_buff *skb) { const struct nf_ct_hook *ct_hook; bool ret = false; rcu_read_lock(); ct_hook = rcu_dereference(nf_ct_hook); if (ct_hook) ret = ct_hook->get_tuple_skb(dst_tuple, skb); rcu_read_unlock(); return ret; } EXPORT_SYMBOL(nf_ct_get_tuple_skb); /* Built-in default zone used e.g. by modules. */ const struct nf_conntrack_zone nf_ct_zone_dflt = { .id = NF_CT_DEFAULT_ZONE_ID, .dir = NF_CT_DEFAULT_ZONE_DIR, }; EXPORT_SYMBOL_GPL(nf_ct_zone_dflt); #endif /* CONFIG_NF_CONNTRACK */ static void __net_init __netfilter_net_init(struct nf_hook_entries __rcu **e, int max) { int h; for (h = 0; h < max; h++) RCU_INIT_POINTER(e[h], NULL); } static int __net_init netfilter_net_init(struct net *net) { __netfilter_net_init(net->nf.hooks_ipv4, ARRAY_SIZE(net->nf.hooks_ipv4)); __netfilter_net_init(net->nf.hooks_ipv6, ARRAY_SIZE(net->nf.hooks_ipv6)); #ifdef CONFIG_NETFILTER_FAMILY_ARP __netfilter_net_init(net->nf.hooks_arp, ARRAY_SIZE(net->nf.hooks_arp)); #endif #ifdef CONFIG_NETFILTER_FAMILY_BRIDGE __netfilter_net_init(net->nf.hooks_bridge, ARRAY_SIZE(net->nf.hooks_bridge)); #endif #ifdef CONFIG_PROC_FS net->nf.proc_netfilter = proc_net_mkdir(net, "netfilter", net->proc_net); if (!net->nf.proc_netfilter) { if (!net_eq(net, &init_net)) pr_err("cannot create netfilter proc entry"); return -ENOMEM; } #endif return 0; } static void __net_exit netfilter_net_exit(struct net *net) { remove_proc_entry("netfilter", net->proc_net); } static struct pernet_operations netfilter_net_ops = { .init = netfilter_net_init, .exit = netfilter_net_exit, }; int __init netfilter_init(void) { int ret; ret = register_pernet_subsys(&netfilter_net_ops); if (ret < 0) goto err; #ifdef CONFIG_LWTUNNEL ret = netfilter_lwtunnel_init(); if (ret < 0) goto err_lwtunnel_pernet; #endif ret = netfilter_log_init(); if (ret < 0) goto err_log_pernet; return 0; err_log_pernet: #ifdef CONFIG_LWTUNNEL netfilter_lwtunnel_fini(); err_lwtunnel_pernet: #endif unregister_pernet_subsys(&netfilter_net_ops); err: return ret; }
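For completeness, a minimal sketch of the consumer side of the registration API implemented above: a module registering, and on exit unregistering, one IPv4 LOCAL_IN hook. All names here are hypothetical.

#include <linux/module.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
#include <net/net_namespace.h>

static unsigned int demo_hookfn(void *priv, struct sk_buff *skb,
				const struct nf_hook_state *state)
{
	return NF_ACCEPT;	/* pass every packet through */
}

static const struct nf_hook_ops demo_ops = {
	.hook		= demo_hookfn,
	.pf		= NFPROTO_IPV4,
	.hooknum	= NF_INET_LOCAL_IN,
	.priority	= NF_IP_PRI_FILTER,
};

static int __init demo_init(void)
{
	return nf_register_net_hook(&init_net, &demo_ops);
}

static void __exit demo_exit(void)
{
	nf_unregister_net_hook(&init_net, &demo_ops);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("illustrative netfilter hook registration");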
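nf_ct_hook, nfnl_ct_hook and the defrag hooks above all follow the same pattern: a __rcu pointer to a const ops structure that the conntrack module publishes when it loads and that callers dereference under rcu_read_lock(), re-checking for NULL each time. A small sketch of that pattern (the demo_* names are invented for illustration, not part of the netfilter code):

#include <linux/rcupdate.h>

struct demo_hook {
	void (*attach)(int value);
};

static const struct demo_hook __rcu *demo_hook_ptr;

/* provider side: publish the ops table when the module loads */
static void demo_hook_register(const struct demo_hook *h)
{
	rcu_assign_pointer(demo_hook_ptr, h);	/* pairs with rcu_dereference() */
}

/* provider side: clear the pointer and wait out in-flight readers */
static void demo_hook_unregister(void)
{
	RCU_INIT_POINTER(demo_hook_ptr, NULL);
	synchronize_rcu();
}

/* caller side mirrors nf_ct_attach(): take the read lock, test for NULL,
 * call through, drop the lock. */
static void demo_call(int value)
{
	const struct demo_hook *h;

	rcu_read_lock();
	h = rcu_dereference(demo_hook_ptr);
	if (h)
		h->attach(value);
	rcu_read_unlock();
}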
// SPDX-License-Identifier: GPL-2.0-or-later /* linux/net/ipv4/arp.c * * Copyright (C) 1994 by Florian La Roche * * This module implements the Address Resolution Protocol ARP (RFC 826), *
which is used to convert IP addresses (or in the future maybe other * high-level addresses) into a low-level hardware address (like an Ethernet * address). * * Fixes: * Alan Cox : Removed the Ethernet assumptions in * Florian's code * Alan Cox : Fixed some small errors in the ARP * logic * Alan Cox : Allow >4K in /proc * Alan Cox : Make ARP add its own protocol entry * Ross Martin : Rewrote arp_rcv() and arp_get_info() * Stephen Henson : Add AX25 support to arp_get_info() * Alan Cox : Drop data when a device is downed. * Alan Cox : Use init_timer(). * Alan Cox : Double lock fixes. * Martin Seine : Move the arphdr structure * to if_arp.h for compatibility. * with BSD based programs. * Andrew Tridgell : Added ARP netmask code and * re-arranged proxy handling. * Alan Cox : Changed to use notifiers. * Niibe Yutaka : Reply for this device or proxies only. * Alan Cox : Don't proxy across hardware types! * Jonathan Naylor : Added support for NET/ROM. * Mike Shaver : RFC1122 checks. * Jonathan Naylor : Only lookup the hardware address for * the correct hardware type. * Germano Caronni : Assorted subtle races. * Craig Schlenter : Don't modify permanent entry * during arp_rcv. * Russ Nelson : Tidied up a few bits. * Alexey Kuznetsov: Major changes to caching and behaviour, * eg intelligent arp probing and * generation * of host down events. * Alan Cox : Missing unlock in device events. * Eckes : ARP ioctl control errors. * Alexey Kuznetsov: Arp free fix. * Manuel Rodriguez: Gratuitous ARP. * Jonathan Layes : Added arpd support through kerneld * message queue (960314) * Mike Shaver : /proc/sys/net/ipv4/arp_* support * Mike McLagan : Routing by source * Stuart Cheshire : Metricom and grat arp fixes * *** FOR 2.1 clean this up *** * Lawrence V. Stefani: (08/12/96) Added FDDI support. * Alan Cox : Took the AP1000 nasty FDDI hack and * folded into the mainstream FDDI code. * Ack spit, Linus how did you allow that * one in... * Jes Sorensen : Make FDDI work again in 2.1.x and * clean up the APFDDI & gen. FDDI bits. * Alexey Kuznetsov: new arp state machine; * now it is in net/core/neighbour.c. * Krzysztof Halasa: Added Frame Relay ARP support. * Arnaldo C. Melo : convert /proc/net/arp to seq_file * Shmulik Hen: Split arp_send to arp_create and * arp_xmit so intermediate drivers like * bonding can change the skb before * sending (e.g. insert 8021q tag). * Harald Welte : convert to make use of jenkins hash * Jesper D. Brouer: Proxy ARP PVLAN RFC 3069 support. 
*/ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/types.h> #include <linux/string.h> #include <linux/kernel.h> #include <linux/capability.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/errno.h> #include <linux/in.h> #include <linux/mm.h> #include <linux/inet.h> #include <linux/inetdevice.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/fddidevice.h> #include <linux/if_arp.h> #include <linux/skbuff.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/stat.h> #include <linux/init.h> #include <linux/net.h> #include <linux/rcupdate.h> #include <linux/slab.h> #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> #endif #include <net/net_namespace.h> #include <net/ip.h> #include <net/icmp.h> #include <net/route.h> #include <net/protocol.h> #include <net/tcp.h> #include <net/sock.h> #include <net/arp.h> #include <net/ax25.h> #include <net/netrom.h> #include <net/dst_metadata.h> #include <net/ip_tunnels.h> #include <linux/uaccess.h> #include <linux/netfilter_arp.h> /* * Interface to generic neighbour cache. */ static u32 arp_hash(const void *pkey, const struct net_device *dev, __u32 *hash_rnd); static bool arp_key_eq(const struct neighbour *n, const void *pkey); static int arp_constructor(struct neighbour *neigh); static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb); static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb); static void parp_redo(struct sk_buff *skb); static int arp_is_multicast(const void *pkey); static const struct neigh_ops arp_generic_ops = { .family = AF_INET, .solicit = arp_solicit, .error_report = arp_error_report, .output = neigh_resolve_output, .connected_output = neigh_connected_output, }; static const struct neigh_ops arp_hh_ops = { .family = AF_INET, .solicit = arp_solicit, .error_report = arp_error_report, .output = neigh_resolve_output, .connected_output = neigh_resolve_output, }; static const struct neigh_ops arp_direct_ops = { .family = AF_INET, .output = neigh_direct_output, .connected_output = neigh_direct_output, }; struct neigh_table arp_tbl = { .family = AF_INET, .key_len = 4, .protocol = cpu_to_be16(ETH_P_IP), .hash = arp_hash, .key_eq = arp_key_eq, .constructor = arp_constructor, .proxy_redo = parp_redo, .is_multicast = arp_is_multicast, .id = "arp_cache", .parms = { .tbl = &arp_tbl, .reachable_time = 30 * HZ, .data = { [NEIGH_VAR_MCAST_PROBES] = 3, [NEIGH_VAR_UCAST_PROBES] = 3, [NEIGH_VAR_RETRANS_TIME] = 1 * HZ, [NEIGH_VAR_BASE_REACHABLE_TIME] = 30 * HZ, [NEIGH_VAR_DELAY_PROBE_TIME] = 5 * HZ, [NEIGH_VAR_INTERVAL_PROBE_TIME_MS] = 5 * HZ, [NEIGH_VAR_GC_STALETIME] = 60 * HZ, [NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_DEFAULT, [NEIGH_VAR_PROXY_QLEN] = 64, [NEIGH_VAR_ANYCAST_DELAY] = 1 * HZ, [NEIGH_VAR_PROXY_DELAY] = (8 * HZ) / 10, [NEIGH_VAR_LOCKTIME] = 1 * HZ, }, }, .gc_interval = 30 * HZ, .gc_thresh1 = 128, .gc_thresh2 = 512, .gc_thresh3 = 1024, }; EXPORT_SYMBOL(arp_tbl); int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir) { switch (dev->type) { case ARPHRD_ETHER: case ARPHRD_FDDI: case ARPHRD_IEEE802: ip_eth_mc_map(addr, haddr); return 0; case ARPHRD_INFINIBAND: ip_ib_mc_map(addr, dev->broadcast, haddr); return 0; case ARPHRD_IPGRE: ip_ipgre_mc_map(addr, dev->broadcast, haddr); return 0; default: if (dir) { memcpy(haddr, dev->broadcast, dev->addr_len); return 0; } } return -EINVAL; } static u32 arp_hash(const void *pkey, const struct net_device *dev, __u32 *hash_rnd) { return arp_hashfn(pkey, dev, 
hash_rnd); } static bool arp_key_eq(const struct neighbour *neigh, const void *pkey) { return neigh_key_eq32(neigh, pkey); } static int arp_constructor(struct neighbour *neigh) { __be32 addr; struct net_device *dev = neigh->dev; struct in_device *in_dev; struct neigh_parms *parms; u32 inaddr_any = INADDR_ANY; if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT)) memcpy(neigh->primary_key, &inaddr_any, arp_tbl.key_len); addr = *(__be32 *)neigh->primary_key; rcu_read_lock(); in_dev = __in_dev_get_rcu(dev); if (!in_dev) { rcu_read_unlock(); return -EINVAL; } neigh->type = inet_addr_type_dev_table(dev_net(dev), dev, addr); parms = in_dev->arp_parms; __neigh_parms_put(neigh->parms); neigh->parms = neigh_parms_clone(parms); rcu_read_unlock(); if (!dev->header_ops) { neigh->nud_state = NUD_NOARP; neigh->ops = &arp_direct_ops; neigh->output = neigh_direct_output; } else { /* Good devices (checked by reading texts, but only Ethernet is tested) ARPHRD_ETHER: (ethernet, apfddi) ARPHRD_FDDI: (fddi) ARPHRD_IEEE802: (tr) ARPHRD_METRICOM: (strip) ARPHRD_ARCNET: etc. etc. etc. ARPHRD_IPDDP will also work, if author repairs it. I did not it, because this driver does not work even in old paradigm. */ if (neigh->type == RTN_MULTICAST) { neigh->nud_state = NUD_NOARP; arp_mc_map(addr, neigh->ha, dev, 1); } else if (dev->flags & (IFF_NOARP | IFF_LOOPBACK)) { neigh->nud_state = NUD_NOARP; memcpy(neigh->ha, dev->dev_addr, dev->addr_len); } else if (neigh->type == RTN_BROADCAST || (dev->flags & IFF_POINTOPOINT)) { neigh->nud_state = NUD_NOARP; memcpy(neigh->ha, dev->broadcast, dev->addr_len); } if (dev->header_ops->cache) neigh->ops = &arp_hh_ops; else neigh->ops = &arp_generic_ops; if (neigh->nud_state & NUD_VALID) neigh->output = neigh->ops->connected_output; else neigh->output = neigh->ops->output; } return 0; } static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb) { dst_link_failure(skb); kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_FAILED); } /* Create and send an arp packet. */ static void arp_send_dst(int type, int ptype, __be32 dest_ip, struct net_device *dev, __be32 src_ip, const unsigned char *dest_hw, const unsigned char *src_hw, const unsigned char *target_hw, struct dst_entry *dst) { struct sk_buff *skb; /* arp on this interface. 
*/ if (dev->flags & IFF_NOARP) return; skb = arp_create(type, ptype, dest_ip, dev, src_ip, dest_hw, src_hw, target_hw); if (!skb) return; skb_dst_set(skb, dst_clone(dst)); arp_xmit(skb); } void arp_send(int type, int ptype, __be32 dest_ip, struct net_device *dev, __be32 src_ip, const unsigned char *dest_hw, const unsigned char *src_hw, const unsigned char *target_hw) { arp_send_dst(type, ptype, dest_ip, dev, src_ip, dest_hw, src_hw, target_hw, NULL); } EXPORT_SYMBOL(arp_send); static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb) { __be32 saddr = 0; u8 dst_ha[MAX_ADDR_LEN], *dst_hw = NULL; struct net_device *dev = neigh->dev; __be32 target = *(__be32 *)neigh->primary_key; int probes = atomic_read(&neigh->probes); struct in_device *in_dev; struct dst_entry *dst = NULL; rcu_read_lock(); in_dev = __in_dev_get_rcu(dev); if (!in_dev) { rcu_read_unlock(); return; } switch (IN_DEV_ARP_ANNOUNCE(in_dev)) { default: case 0: /* By default announce any local IP */ if (skb && inet_addr_type_dev_table(dev_net(dev), dev, ip_hdr(skb)->saddr) == RTN_LOCAL) saddr = ip_hdr(skb)->saddr; break; case 1: /* Restrict announcements of saddr in same subnet */ if (!skb) break; saddr = ip_hdr(skb)->saddr; if (inet_addr_type_dev_table(dev_net(dev), dev, saddr) == RTN_LOCAL) { /* saddr should be known to target */ if (inet_addr_onlink(in_dev, target, saddr)) break; } saddr = 0; break; case 2: /* Avoid secondary IPs, get a primary/preferred one */ break; } rcu_read_unlock(); if (!saddr) saddr = inet_select_addr(dev, target, RT_SCOPE_LINK); probes -= NEIGH_VAR(neigh->parms, UCAST_PROBES); if (probes < 0) { if (!(READ_ONCE(neigh->nud_state) & NUD_VALID)) pr_debug("trying to ucast probe in NUD_INVALID\n"); neigh_ha_snapshot(dst_ha, neigh, dev); dst_hw = dst_ha; } else { probes -= NEIGH_VAR(neigh->parms, APP_PROBES); if (probes < 0) { neigh_app_ns(neigh); return; } } if (skb && !(dev->priv_flags & IFF_XMIT_DST_RELEASE)) dst = skb_dst(skb); arp_send_dst(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr, dst_hw, dev->dev_addr, NULL, dst); } static int arp_ignore(struct in_device *in_dev, __be32 sip, __be32 tip) { struct net *net = dev_net(in_dev->dev); int scope; switch (IN_DEV_ARP_IGNORE(in_dev)) { case 0: /* Reply, the tip is already validated */ return 0; case 1: /* Reply only if tip is configured on the incoming interface */ sip = 0; scope = RT_SCOPE_HOST; break; case 2: /* * Reply only if tip is configured on the incoming interface * and is in same subnet as sip */ scope = RT_SCOPE_HOST; break; case 3: /* Do not reply for scope host addresses */ sip = 0; scope = RT_SCOPE_LINK; in_dev = NULL; break; case 4: /* Reserved */ case 5: case 6: case 7: return 0; case 8: /* Do not reply */ return 1; default: return 0; } return !inet_confirm_addr(net, in_dev, sip, tip, scope); } static int arp_accept(struct in_device *in_dev, __be32 sip) { struct net *net = dev_net(in_dev->dev); int scope = RT_SCOPE_LINK; switch (IN_DEV_ARP_ACCEPT(in_dev)) { case 0: /* Don't create new entries from garp */ return 0; case 1: /* Create new entries from garp */ return 1; case 2: /* Create a neighbor in the arp table only if sip * is in the same subnet as an address configured * on the interface that received the garp message */ return !!inet_confirm_addr(net, in_dev, sip, 0, scope); default: return 0; } } static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev) { struct rtable *rt; int flag = 0; /*unsigned long now; */ struct net *net = dev_net(dev); rt = ip_route_output(net, sip, tip, 0, l3mdev_master_ifindex_rcu(dev), 
RT_SCOPE_UNIVERSE); if (IS_ERR(rt)) return 1; if (rt->dst.dev != dev) { __NET_INC_STATS(net, LINUX_MIB_ARPFILTER); flag = 1; } ip_rt_put(rt); return flag; } /* * Check if we can use proxy ARP for this path */ static inline int arp_fwd_proxy(struct in_device *in_dev, struct net_device *dev, struct rtable *rt) { struct in_device *out_dev; int imi, omi = -1; if (rt->dst.dev == dev) return 0; if (!IN_DEV_PROXY_ARP(in_dev)) return 0; imi = IN_DEV_MEDIUM_ID(in_dev); if (imi == 0) return 1; if (imi == -1) return 0; /* place to check for proxy_arp for routes */ out_dev = __in_dev_get_rcu(rt->dst.dev); if (out_dev) omi = IN_DEV_MEDIUM_ID(out_dev); return omi != imi && omi != -1; } /* * Check for RFC3069 proxy arp private VLAN (allow to send back to same dev) * * RFC3069 supports proxy arp replies back to the same interface. This * is done to support (ethernet) switch features, like RFC 3069, where * the individual ports are not allowed to communicate with each * other, BUT they are allowed to talk to the upstream router. As * described in RFC 3069, it is possible to allow these hosts to * communicate through the upstream router, by proxy_arp'ing. * * RFC 3069: "VLAN Aggregation for Efficient IP Address Allocation" * * This technology is known by different names: * In RFC 3069 it is called VLAN Aggregation. * Cisco and Allied Telesyn call it Private VLAN. * Hewlett-Packard call it Source-Port filtering or port-isolation. * Ericsson call it MAC-Forced Forwarding (RFC Draft). * */ static inline int arp_fwd_pvlan(struct in_device *in_dev, struct net_device *dev, struct rtable *rt, __be32 sip, __be32 tip) { /* Private VLAN is only concerned about the same ethernet segment */ if (rt->dst.dev != dev) return 0; /* Don't reply on self probes (often done by windowz boxes)*/ if (sip == tip) return 0; if (IN_DEV_PROXY_ARP_PVLAN(in_dev)) return 1; else return 0; } /* * Interface to link layer: send routine and receive handler. */ /* * Create an arp packet. If dest_hw is not set, we create a broadcast * message. */ struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip, struct net_device *dev, __be32 src_ip, const unsigned char *dest_hw, const unsigned char *src_hw, const unsigned char *target_hw) { struct sk_buff *skb; struct arphdr *arp; unsigned char *arp_ptr; int hlen = LL_RESERVED_SPACE(dev); int tlen = dev->needed_tailroom; /* * Allocate a buffer */ skb = alloc_skb(arp_hdr_len(dev) + hlen + tlen, GFP_ATOMIC); if (!skb) return NULL; skb_reserve(skb, hlen); skb_reset_network_header(skb); arp = skb_put(skb, arp_hdr_len(dev)); skb->dev = dev; skb->protocol = htons(ETH_P_ARP); if (!src_hw) src_hw = dev->dev_addr; if (!dest_hw) dest_hw = dev->broadcast; /* * Fill the device header for the ARP frame */ if (dev_hard_header(skb, dev, ptype, dest_hw, src_hw, skb->len) < 0) goto out; /* * Fill out the arp protocol part. * * The arp hardware type should match the device type, except for FDDI, * which (according to RFC 1390) should always equal 1 (Ethernet). */ /* * Exceptions everywhere. AX.25 uses the AX.25 PID value not the * DIX code for the protocol. Make these device structure fields. 
*/ switch (dev->type) { default: arp->ar_hrd = htons(dev->type); arp->ar_pro = htons(ETH_P_IP); break; #if IS_ENABLED(CONFIG_AX25) case ARPHRD_AX25: arp->ar_hrd = htons(ARPHRD_AX25); arp->ar_pro = htons(AX25_P_IP); break; #if IS_ENABLED(CONFIG_NETROM) case ARPHRD_NETROM: arp->ar_hrd = htons(ARPHRD_NETROM); arp->ar_pro = htons(AX25_P_IP); break; #endif #endif #if IS_ENABLED(CONFIG_FDDI) case ARPHRD_FDDI: arp->ar_hrd = htons(ARPHRD_ETHER); arp->ar_pro = htons(ETH_P_IP); break; #endif } arp->ar_hln = dev->addr_len; arp->ar_pln = 4; arp->ar_op = htons(type); arp_ptr = (unsigned char *)(arp + 1); memcpy(arp_ptr, src_hw, dev->addr_len); arp_ptr += dev->addr_len; memcpy(arp_ptr, &src_ip, 4); arp_ptr += 4; switch (dev->type) { #if IS_ENABLED(CONFIG_FIREWIRE_NET) case ARPHRD_IEEE1394: break; #endif default: if (target_hw) memcpy(arp_ptr, target_hw, dev->addr_len); else memset(arp_ptr, 0, dev->addr_len); arp_ptr += dev->addr_len; } memcpy(arp_ptr, &dest_ip, 4); return skb; out: kfree_skb(skb); return NULL; } EXPORT_SYMBOL(arp_create); static int arp_xmit_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { return dev_queue_xmit(skb); } /* * Send an arp packet. */ void arp_xmit(struct sk_buff *skb) { rcu_read_lock(); /* Send it off, maybe filter it using firewalling first. */ NF_HOOK(NFPROTO_ARP, NF_ARP_OUT, dev_net_rcu(skb->dev), NULL, skb, NULL, skb->dev, arp_xmit_finish); rcu_read_unlock(); } EXPORT_SYMBOL(arp_xmit); static bool arp_is_garp(struct net *net, struct net_device *dev, int *addr_type, __be16 ar_op, __be32 sip, __be32 tip, unsigned char *sha, unsigned char *tha) { bool is_garp = tip == sip; /* Gratuitous ARP _replies_ also require target hwaddr to be * the same as source. */ if (is_garp && ar_op == htons(ARPOP_REPLY)) is_garp = /* IPv4 over IEEE 1394 doesn't provide target * hardware address field in its ARP payload. */ tha && !memcmp(tha, sha, dev->addr_len); if (is_garp) { *addr_type = inet_addr_type_dev_table(net, dev, sip); if (*addr_type != RTN_UNICAST) is_garp = false; } return is_garp; } /* * Process an arp request. */ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb) { struct net_device *dev = skb->dev; struct in_device *in_dev = __in_dev_get_rcu(dev); struct arphdr *arp; unsigned char *arp_ptr; struct rtable *rt; unsigned char *sha; unsigned char *tha = NULL; __be32 sip, tip; u16 dev_type = dev->type; int addr_type; struct neighbour *n; struct dst_entry *reply_dst = NULL; bool is_garp = false; /* arp_rcv below verifies the ARP header and verifies the device * is ARP'able. */ if (!in_dev) goto out_free_skb; arp = arp_hdr(skb); switch (dev_type) { default: if (arp->ar_pro != htons(ETH_P_IP) || htons(dev_type) != arp->ar_hrd) goto out_free_skb; break; case ARPHRD_ETHER: case ARPHRD_FDDI: case ARPHRD_IEEE802: /* * ETHERNET, and Fibre Channel (which are IEEE 802 * devices, according to RFC 2625) devices will accept ARP * hardware types of either 1 (Ethernet) or 6 (IEEE 802.2). 
* This is the case also of FDDI, where the RFC 1390 says that * FDDI devices should accept ARP hardware of (1) Ethernet, * however, to be more robust, we'll accept both 1 (Ethernet) * or 6 (IEEE 802.2) */ if ((arp->ar_hrd != htons(ARPHRD_ETHER) && arp->ar_hrd != htons(ARPHRD_IEEE802)) || arp->ar_pro != htons(ETH_P_IP)) goto out_free_skb; break; case ARPHRD_AX25: if (arp->ar_pro != htons(AX25_P_IP) || arp->ar_hrd != htons(ARPHRD_AX25)) goto out_free_skb; break; case ARPHRD_NETROM: if (arp->ar_pro != htons(AX25_P_IP) || arp->ar_hrd != htons(ARPHRD_NETROM)) goto out_free_skb; break; } /* Understand only these message types */ if (arp->ar_op != htons(ARPOP_REPLY) && arp->ar_op != htons(ARPOP_REQUEST)) goto out_free_skb; /* * Extract fields */ arp_ptr = (unsigned char *)(arp + 1); sha = arp_ptr; arp_ptr += dev->addr_len; memcpy(&sip, arp_ptr, 4); arp_ptr += 4; switch (dev_type) { #if IS_ENABLED(CONFIG_FIREWIRE_NET) case ARPHRD_IEEE1394: break; #endif default: tha = arp_ptr; arp_ptr += dev->addr_len; } memcpy(&tip, arp_ptr, 4); /* * Check for bad requests for 127.x.x.x and requests for multicast * addresses. If this is one such, delete it. */ if (ipv4_is_multicast(tip) || (!IN_DEV_ROUTE_LOCALNET(in_dev) && ipv4_is_loopback(tip))) goto out_free_skb; /* * For some 802.11 wireless deployments (and possibly other networks), * there will be an ARP proxy and gratuitous ARP frames are attacks * and thus should not be accepted. */ if (sip == tip && IN_DEV_ORCONF(in_dev, DROP_GRATUITOUS_ARP)) goto out_free_skb; /* * Special case: We must set Frame Relay source Q.922 address */ if (dev_type == ARPHRD_DLCI) sha = dev->broadcast; /* * Process entry. The idea here is we want to send a reply if it is a * request for us or if it is a request for someone else that we hold * a proxy for. We want to add an entry to our cache if it is a reply * to us or if it is a request for our address. * (The assumption for this last is that if someone is requesting our * address, they are probably intending to talk to us, so it saves time * if we cache their address. Their address is also probably not in * our cache, since ours is not in their cache.) * * Putting this another way, we only care about replies if they are to * us, in which case we add them to the cache. For requests, we care * about those for us and those for our proxies. We reply to both, * and in the case of requests for us we add the requester to the arp * cache. 
*/ if (arp->ar_op == htons(ARPOP_REQUEST) && skb_metadata_dst(skb)) reply_dst = (struct dst_entry *) iptunnel_metadata_reply(skb_metadata_dst(skb), GFP_ATOMIC); /* Special case: IPv4 duplicate address detection packet (RFC2131) */ if (sip == 0) { if (arp->ar_op == htons(ARPOP_REQUEST) && inet_addr_type_dev_table(net, dev, tip) == RTN_LOCAL && !arp_ignore(in_dev, sip, tip)) arp_send_dst(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha, dev->dev_addr, sha, reply_dst); goto out_consume_skb; } if (arp->ar_op == htons(ARPOP_REQUEST) && ip_route_input_noref(skb, tip, sip, 0, dev) == 0) { rt = skb_rtable(skb); addr_type = rt->rt_type; if (addr_type == RTN_LOCAL) { int dont_send; dont_send = arp_ignore(in_dev, sip, tip); if (!dont_send && IN_DEV_ARPFILTER(in_dev)) dont_send = arp_filter(sip, tip, dev); if (!dont_send) { n = neigh_event_ns(&arp_tbl, sha, &sip, dev); if (n) { arp_send_dst(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha, dev->dev_addr, sha, reply_dst); neigh_release(n); } } goto out_consume_skb; } else if (IN_DEV_FORWARD(in_dev)) { if (addr_type == RTN_UNICAST && (arp_fwd_proxy(in_dev, dev, rt) || arp_fwd_pvlan(in_dev, dev, rt, sip, tip) || (rt->dst.dev != dev && pneigh_lookup(&arp_tbl, net, &tip, dev)))) { n = neigh_event_ns(&arp_tbl, sha, &sip, dev); if (n) neigh_release(n); if (NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED || skb->pkt_type == PACKET_HOST || NEIGH_VAR(in_dev->arp_parms, PROXY_DELAY) == 0) { arp_send_dst(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha, dev->dev_addr, sha, reply_dst); } else { pneigh_enqueue(&arp_tbl, in_dev->arp_parms, skb); goto out_free_dst; } goto out_consume_skb; } } } /* Update our ARP tables */ n = __neigh_lookup(&arp_tbl, &sip, dev, 0); addr_type = -1; if (n || arp_accept(in_dev, sip)) { is_garp = arp_is_garp(net, dev, &addr_type, arp->ar_op, sip, tip, sha, tha); } if (arp_accept(in_dev, sip)) { /* Unsolicited ARP is not accepted by default. It is possible, that this option should be enabled for some devices (strip is candidate) */ if (!n && (is_garp || (arp->ar_op == htons(ARPOP_REPLY) && (addr_type == RTN_UNICAST || (addr_type < 0 && /* postpone calculation to as late as possible */ inet_addr_type_dev_table(net, dev, sip) == RTN_UNICAST))))) n = __neigh_lookup(&arp_tbl, &sip, dev, 1); } if (n) { int state = NUD_REACHABLE; int override; /* If several different ARP replies follows back-to-back, use the FIRST one. It is possible, if several proxy agents are active. Taking the first reply prevents arp trashing and chooses the fastest router. */ override = time_after(jiffies, n->updated + NEIGH_VAR(n->parms, LOCKTIME)) || is_garp; /* Broadcast replies and request packets do not assert neighbour reachability. */ if (arp->ar_op != htons(ARPOP_REPLY) || skb->pkt_type != PACKET_HOST) state = NUD_STALE; neigh_update(n, sha, state, override ? NEIGH_UPDATE_F_OVERRIDE : 0, 0); neigh_release(n); } out_consume_skb: consume_skb(skb); out_free_dst: dst_release(reply_dst); return NET_RX_SUCCESS; out_free_skb: kfree_skb(skb); return NET_RX_DROP; } static void parp_redo(struct sk_buff *skb) { arp_process(dev_net(skb->dev), NULL, skb); } static int arp_is_multicast(const void *pkey) { return ipv4_is_multicast(*((__be32 *)pkey)); } /* * Receive an arp request from the device layer. 
*/ static int arp_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { enum skb_drop_reason drop_reason; const struct arphdr *arp; /* do not tweak dropwatch on an ARP we will ignore */ if (dev->flags & IFF_NOARP || skb->pkt_type == PACKET_OTHERHOST || skb->pkt_type == PACKET_LOOPBACK) goto consumeskb; skb = skb_share_check(skb, GFP_ATOMIC); if (!skb) goto out_of_mem; /* ARP header, plus 2 device addresses, plus 2 IP addresses. */ drop_reason = pskb_may_pull_reason(skb, arp_hdr_len(dev)); if (drop_reason != SKB_NOT_DROPPED_YET) goto freeskb; arp = arp_hdr(skb); if (arp->ar_hln != dev->addr_len || arp->ar_pln != 4) { drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; goto freeskb; } memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb)); return NF_HOOK(NFPROTO_ARP, NF_ARP_IN, dev_net(dev), NULL, skb, dev, NULL, arp_process); consumeskb: consume_skb(skb); return NET_RX_SUCCESS; freeskb: kfree_skb_reason(skb, drop_reason); out_of_mem: return NET_RX_DROP; } /* * User level interface (ioctl) */ static struct net_device *arp_req_dev_by_name(struct net *net, struct arpreq *r, bool getarp) { struct net_device *dev; if (getarp) dev = dev_get_by_name_rcu(net, r->arp_dev); else dev = __dev_get_by_name(net, r->arp_dev); if (!dev) return ERR_PTR(-ENODEV); /* Mmmm... It is wrong... ARPHRD_NETROM == 0 */ if (!r->arp_ha.sa_family) r->arp_ha.sa_family = dev->type; if ((r->arp_flags & ATF_COM) && r->arp_ha.sa_family != dev->type) return ERR_PTR(-EINVAL); return dev; } static struct net_device *arp_req_dev(struct net *net, struct arpreq *r) { struct net_device *dev; struct rtable *rt; __be32 ip; if (r->arp_dev[0]) return arp_req_dev_by_name(net, r, false); if (r->arp_flags & ATF_PUBL) return NULL; ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; rt = ip_route_output(net, ip, 0, 0, 0, RT_SCOPE_LINK); if (IS_ERR(rt)) return ERR_CAST(rt); dev = rt->dst.dev; ip_rt_put(rt); if (!dev) return ERR_PTR(-EINVAL); return dev; } /* * Set (create) an ARP cache entry. */ static int arp_req_set_proxy(struct net *net, struct net_device *dev, int on) { if (!dev) { IPV4_DEVCONF_ALL(net, PROXY_ARP) = on; return 0; } if (__in_dev_get_rtnl_net(dev)) { IN_DEV_CONF_SET(__in_dev_get_rtnl_net(dev), PROXY_ARP, on); return 0; } return -ENXIO; } static int arp_req_set_public(struct net *net, struct arpreq *r, struct net_device *dev) { __be32 mask = ((struct sockaddr_in *)&r->arp_netmask)->sin_addr.s_addr; if (!dev && (r->arp_flags & ATF_COM)) { dev = dev_getbyhwaddr(net, r->arp_ha.sa_family, r->arp_ha.sa_data); if (!dev) return -ENODEV; } if (mask) { __be32 ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; return pneigh_create(&arp_tbl, net, &ip, dev, 0, 0, false); } return arp_req_set_proxy(net, dev, 1); } static int arp_req_set(struct net *net, struct arpreq *r) { struct neighbour *neigh; struct net_device *dev; __be32 ip; int err; dev = arp_req_dev(net, r); if (IS_ERR(dev)) return PTR_ERR(dev); if (r->arp_flags & ATF_PUBL) return arp_req_set_public(net, r, dev); switch (dev->type) { #if IS_ENABLED(CONFIG_FDDI) case ARPHRD_FDDI: /* * According to RFC 1390, FDDI devices should accept ARP * hardware types of 1 (Ethernet). However, to be more * robust, we'll accept hardware types of either 1 (Ethernet) * or 6 (IEEE 802.2). 
*/ if (r->arp_ha.sa_family != ARPHRD_FDDI && r->arp_ha.sa_family != ARPHRD_ETHER && r->arp_ha.sa_family != ARPHRD_IEEE802) return -EINVAL; break; #endif default: if (r->arp_ha.sa_family != dev->type) return -EINVAL; break; } ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; neigh = __neigh_lookup_errno(&arp_tbl, &ip, dev); err = PTR_ERR(neigh); if (!IS_ERR(neigh)) { unsigned int state = NUD_STALE; if (r->arp_flags & ATF_PERM) { r->arp_flags |= ATF_COM; state = NUD_PERMANENT; } err = neigh_update(neigh, (r->arp_flags & ATF_COM) ? r->arp_ha.sa_data : NULL, state, NEIGH_UPDATE_F_OVERRIDE | NEIGH_UPDATE_F_ADMIN, 0); neigh_release(neigh); } return err; } static unsigned int arp_state_to_flags(struct neighbour *neigh) { if (neigh->nud_state&NUD_PERMANENT) return ATF_PERM | ATF_COM; else if (neigh->nud_state&NUD_VALID) return ATF_COM; else return 0; } /* * Get an ARP cache entry. */ static int arp_req_get(struct net *net, struct arpreq *r) { __be32 ip = ((struct sockaddr_in *) &r->arp_pa)->sin_addr.s_addr; struct neighbour *neigh; struct net_device *dev; if (!r->arp_dev[0]) return -ENODEV; dev = arp_req_dev_by_name(net, r, true); if (IS_ERR(dev)) return PTR_ERR(dev); neigh = neigh_lookup(&arp_tbl, &ip, dev); if (!neigh) return -ENXIO; if (READ_ONCE(neigh->nud_state) & NUD_NOARP) { neigh_release(neigh); return -ENXIO; } read_lock_bh(&neigh->lock); memcpy(r->arp_ha.sa_data, neigh->ha, min(dev->addr_len, sizeof(r->arp_ha.sa_data_min))); r->arp_flags = arp_state_to_flags(neigh); read_unlock_bh(&neigh->lock); neigh_release(neigh); r->arp_ha.sa_family = dev->type; netdev_copy_name(dev, r->arp_dev); return 0; } int arp_invalidate(struct net_device *dev, __be32 ip, bool force) { struct neighbour *neigh = neigh_lookup(&arp_tbl, &ip, dev); int err = -ENXIO; struct neigh_table *tbl = &arp_tbl; if (neigh) { if ((READ_ONCE(neigh->nud_state) & NUD_VALID) && !force) { neigh_release(neigh); return 0; } if (READ_ONCE(neigh->nud_state) & ~NUD_NOARP) err = neigh_update(neigh, NULL, NUD_FAILED, NEIGH_UPDATE_F_OVERRIDE| NEIGH_UPDATE_F_ADMIN, 0); write_lock_bh(&tbl->lock); neigh_release(neigh); neigh_remove_one(neigh); write_unlock_bh(&tbl->lock); } return err; } static int arp_req_delete_public(struct net *net, struct arpreq *r, struct net_device *dev) { __be32 mask = ((struct sockaddr_in *)&r->arp_netmask)->sin_addr.s_addr; if (mask) { __be32 ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; return pneigh_delete(&arp_tbl, net, &ip, dev); } return arp_req_set_proxy(net, dev, 0); } static int arp_req_delete(struct net *net, struct arpreq *r) { struct net_device *dev; __be32 ip; dev = arp_req_dev(net, r); if (IS_ERR(dev)) return PTR_ERR(dev); if (r->arp_flags & ATF_PUBL) return arp_req_delete_public(net, r, dev); ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; return arp_invalidate(dev, ip, true); } /* * Handle an ARP layer I/O control request. 
*/ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg) { struct arpreq r; __be32 *netmask; int err; switch (cmd) { case SIOCDARP: case SIOCSARP: if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; fallthrough; case SIOCGARP: err = copy_from_user(&r, arg, sizeof(struct arpreq)); if (err) return -EFAULT; break; default: return -EINVAL; } if (r.arp_pa.sa_family != AF_INET) return -EPFNOSUPPORT; if (!(r.arp_flags & ATF_PUBL) && (r.arp_flags & (ATF_NETMASK | ATF_DONTPUB))) return -EINVAL; netmask = &((struct sockaddr_in *)&r.arp_netmask)->sin_addr.s_addr; if (!(r.arp_flags & ATF_NETMASK)) *netmask = htonl(0xFFFFFFFFUL); else if (*netmask && *netmask != htonl(0xFFFFFFFFUL)) return -EINVAL; switch (cmd) { case SIOCDARP: rtnl_net_lock(net); err = arp_req_delete(net, &r); rtnl_net_unlock(net); break; case SIOCSARP: rtnl_net_lock(net); err = arp_req_set(net, &r); rtnl_net_unlock(net); break; case SIOCGARP: rcu_read_lock(); err = arp_req_get(net, &r); rcu_read_unlock(); if (!err && copy_to_user(arg, &r, sizeof(r))) err = -EFAULT; break; } return err; } static int arp_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct netdev_notifier_change_info *change_info; struct in_device *in_dev; bool evict_nocarrier; switch (event) { case NETDEV_CHANGEADDR: neigh_changeaddr(&arp_tbl, dev); rt_cache_flush(dev_net(dev)); break; case NETDEV_CHANGE: change_info = ptr; if (change_info->flags_changed & IFF_NOARP) neigh_changeaddr(&arp_tbl, dev); in_dev = __in_dev_get_rtnl(dev); if (!in_dev) evict_nocarrier = true; else evict_nocarrier = IN_DEV_ARP_EVICT_NOCARRIER(in_dev); if (evict_nocarrier && !netif_carrier_ok(dev)) neigh_carrier_down(&arp_tbl, dev); break; default: break; } return NOTIFY_DONE; } static struct notifier_block arp_netdev_notifier = { .notifier_call = arp_netdev_event, }; /* Note, that it is not on notifier chain. It is necessary, that this routine was called after route cache will be flushed. */ void arp_ifdown(struct net_device *dev) { neigh_ifdown(&arp_tbl, dev); } /* * Called once on startup. */ static struct packet_type arp_packet_type __read_mostly = { .type = cpu_to_be16(ETH_P_ARP), .func = arp_rcv, }; #ifdef CONFIG_PROC_FS #if IS_ENABLED(CONFIG_AX25) /* * ax25 -> ASCII conversion */ static void ax2asc2(ax25_address *a, char *buf) { char c, *s; int n; for (n = 0, s = buf; n < 6; n++) { c = (a->ax25_call[n] >> 1) & 0x7F; if (c != ' ') *s++ = c; } *s++ = '-'; n = (a->ax25_call[6] >> 1) & 0x0F; if (n > 9) { *s++ = '1'; n -= 10; } *s++ = n + '0'; *s++ = '\0'; if (*buf == '\0' || *buf == '-') { buf[0] = '*'; buf[1] = '\0'; } } #endif /* CONFIG_AX25 */ #define HBUFFERLEN 30 static void arp_format_neigh_entry(struct seq_file *seq, struct neighbour *n) { char hbuffer[HBUFFERLEN]; int k, j; char tbuf[16]; struct net_device *dev = n->dev; int hatype = dev->type; read_lock(&n->lock); /* Convert hardware address to XX:XX:XX:XX ... form. 
*/ #if IS_ENABLED(CONFIG_AX25) if (hatype == ARPHRD_AX25 || hatype == ARPHRD_NETROM) ax2asc2((ax25_address *)n->ha, hbuffer); else { #endif for (k = 0, j = 0; k < HBUFFERLEN - 3 && j < dev->addr_len; j++) { hbuffer[k++] = hex_asc_hi(n->ha[j]); hbuffer[k++] = hex_asc_lo(n->ha[j]); hbuffer[k++] = ':'; } if (k != 0) --k; hbuffer[k] = 0; #if IS_ENABLED(CONFIG_AX25) } #endif sprintf(tbuf, "%pI4", n->primary_key); seq_printf(seq, "%-16s 0x%-10x0x%-10x%-17s * %s\n", tbuf, hatype, arp_state_to_flags(n), hbuffer, dev->name); read_unlock(&n->lock); } static void arp_format_pneigh_entry(struct seq_file *seq, struct pneigh_entry *n) { struct net_device *dev = n->dev; int hatype = dev ? dev->type : 0; char tbuf[16]; sprintf(tbuf, "%pI4", n->key); seq_printf(seq, "%-16s 0x%-10x0x%-10x%s * %s\n", tbuf, hatype, ATF_PUBL | ATF_PERM, "00:00:00:00:00:00", dev ? dev->name : "*"); } static int arp_seq_show(struct seq_file *seq, void *v) { if (v == SEQ_START_TOKEN) { seq_puts(seq, "IP address HW type Flags " "HW address Mask Device\n"); } else { struct neigh_seq_state *state = seq->private; if (state->flags & NEIGH_SEQ_IS_PNEIGH) arp_format_pneigh_entry(seq, v); else arp_format_neigh_entry(seq, v); } return 0; } static void *arp_seq_start(struct seq_file *seq, loff_t *pos) { /* Don't want to confuse "arp -a" w/ magic entries, * so we tell the generic iterator to skip NUD_NOARP. */ return neigh_seq_start(seq, pos, &arp_tbl, NEIGH_SEQ_SKIP_NOARP); } static const struct seq_operations arp_seq_ops = { .start = arp_seq_start, .next = neigh_seq_next, .stop = neigh_seq_stop, .show = arp_seq_show, }; #endif /* CONFIG_PROC_FS */ static int __net_init arp_net_init(struct net *net) { if (!proc_create_net("arp", 0444, net->proc_net, &arp_seq_ops, sizeof(struct neigh_seq_state))) return -ENOMEM; return 0; } static void __net_exit arp_net_exit(struct net *net) { remove_proc_entry("arp", net->proc_net); } static struct pernet_operations arp_net_ops = { .init = arp_net_init, .exit = arp_net_exit, }; void __init arp_init(void) { neigh_table_init(NEIGH_ARP_TABLE, &arp_tbl); dev_add_pack(&arp_packet_type); register_pernet_subsys(&arp_net_ops); #ifdef CONFIG_SYSCTL neigh_sysctl_register(NULL, &arp_tbl.parms, NULL); #endif register_netdevice_notifier(&arp_netdev_notifier); }
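arp_ioctl() above is reachable from userspace through the classic SIOC*ARP ioctls on any AF_INET socket. A small, hedged userspace example exercising the SIOCGARP path served by arp_req_get() (error handling kept minimal; the interface and address are command-line arguments):

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if_arp.h>
#include <netinet/in.h>
#include <arpa/inet.h>

int main(int argc, char **argv)
{
	struct arpreq req;
	struct sockaddr_in *sin;
	unsigned char *ha;
	int fd;

	if (argc != 3) {
		fprintf(stderr, "usage: %s <ipv4-addr> <ifname>\n", argv[0]);
		return 1;
	}

	memset(&req, 0, sizeof(req));
	sin = (struct sockaddr_in *)&req.arp_pa;
	sin->sin_family = AF_INET;		/* arp_ioctl() rejects anything else */
	inet_pton(AF_INET, argv[1], &sin->sin_addr);
	strncpy(req.arp_dev, argv[2], sizeof(req.arp_dev) - 1);	/* required for SIOCGARP */

	fd = socket(AF_INET, SOCK_DGRAM, 0);
	if (fd < 0 || ioctl(fd, SIOCGARP, &req) < 0) {
		perror("SIOCGARP");
		return 1;
	}

	ha = (unsigned char *)req.arp_ha.sa_data;
	printf("%s is at %02x:%02x:%02x:%02x:%02x:%02x (flags 0x%x)\n",
	       argv[1], ha[0], ha[1], ha[2], ha[3], ha[4], ha[5], req.arp_flags);
	close(fd);
	return 0;
}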
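arp_create() above lays the packet out as an arphdr immediately followed by sender-hw, sender-ip, target-hw and target-ip fields whose sizes come from ar_hln and ar_pln. For the common Ethernet/IPv4 case that layout can be reproduced in userspace as a sketch like the one below (the struct and function names are illustrative; actually transmitting the frame would additionally need an Ethernet header and an AF_PACKET socket):

#include <stdint.h>
#include <string.h>
#include <stddef.h>
#include <arpa/inet.h>
#include <net/if_arp.h>		/* struct arphdr, ARPHRD_ETHER, ARPOP_REQUEST */

struct arp_eth_payload {
	uint8_t ar_sha[6];	/* sender hardware address */
	uint8_t ar_sip[4];	/* sender IP address */
	uint8_t ar_tha[6];	/* target hardware address */
	uint8_t ar_tip[4];	/* target IP address */
} __attribute__((packed));

/* build an Ethernet/IPv4 ARP request payload; src_ip and dest_ip are
 * already in network byte order, as in arp_create() */
static size_t build_arp_request(uint8_t *buf, const uint8_t src_hw[6],
				uint32_t src_ip, uint32_t dest_ip)
{
	struct arphdr *arp = (struct arphdr *)buf;
	struct arp_eth_payload *p = (struct arp_eth_payload *)(arp + 1);

	arp->ar_hrd = htons(ARPHRD_ETHER);
	arp->ar_pro = htons(0x0800);		/* ETH_P_IP */
	arp->ar_hln = 6;
	arp->ar_pln = 4;
	arp->ar_op  = htons(ARPOP_REQUEST);

	memcpy(p->ar_sha, src_hw, 6);
	memcpy(p->ar_sip, &src_ip, 4);
	memset(p->ar_tha, 0, 6);		/* unknown target hw, like arp_create() */
	memcpy(p->ar_tip, &dest_ip, 4);

	return sizeof(*arp) + sizeof(*p);
}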
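arp_mc_map() above defers the Ethernet/FDDI case to ip_eth_mc_map(), which applies the standard RFC 1112 mapping: 01:00:5e followed by the low 23 bits of the group address. A tiny userspace illustration of that mapping (the function name is invented for the example):

#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>

/* map an IPv4 multicast group (network byte order) to its Ethernet
 * multicast MAC, mirroring what ip_eth_mc_map() does in the kernel */
static void ipv4_mcast_to_mac(uint32_t group_be, uint8_t mac[6])
{
	uint32_t addr = ntohl(group_be);

	mac[0] = 0x01;
	mac[1] = 0x00;
	mac[2] = 0x5e;
	mac[3] = (addr >> 16) & 0x7f;	/* only 23 of the low 24 bits are kept */
	mac[4] = (addr >> 8) & 0xff;
	mac[5] = addr & 0xff;
}

int main(void)
{
	uint8_t mac[6];
	struct in_addr group;

	inet_pton(AF_INET, "224.0.0.251", &group);	/* mDNS group */
	ipv4_mcast_to_mac(group.s_addr, mac);
	printf("224.0.0.251 -> %02x:%02x:%02x:%02x:%02x:%02x\n",
	       mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]);
	return 0;
}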
// SPDX-License-Identifier: GPL-2.0-or-later
/* SCTP kernel implementation
 * (C) Copyright IBM Corp. 2001, 2004
 * Copyright (c) 1999-2000 Cisco, Inc.
 * Copyright (c) 1999-2001 Motorola, Inc.
 * Copyright (c) 2001-2002 Intel Corp.
 *
 * This file is part of the SCTP kernel implementation
 *
 * These functions work with the state functions in sctp_sm_statefuns.c
 * to implement the state operations.  These functions implement the
 * steps which require modifying existing data structures.
 *
 * Please send any bug reports or fixes you make to the
 * email address(es):
 *    lksctp developers <linux-sctp@vger.kernel.org>
 *
 * Written or modified by:
 *    La Monte H.P. Yarroll <piggy@acm.org>
 *    Karl Knutson <karl@athena.chicago.il.us>
 *    C.
Robin <chris@hundredacre.ac.uk> * Jon Grimm <jgrimm@us.ibm.com> * Xingang Guo <xingang.guo@intel.com> * Dajiang Zhang <dajiang.zhang@nokia.com> * Sridhar Samudrala <sri@us.ibm.com> * Daisy Chang <daisyc@us.ibm.com> * Ardelle Fan <ardelle.fan@intel.com> * Kevin Gao <kevin.gao@intel.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <crypto/utils.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/net.h> #include <linux/inet.h> #include <linux/scatterlist.h> #include <linux/slab.h> #include <net/sock.h> #include <linux/skbuff.h> #include <linux/random.h> /* for get_random_bytes */ #include <net/sctp/sctp.h> #include <net/sctp/sm.h> static struct sctp_chunk *sctp_make_control(const struct sctp_association *asoc, __u8 type, __u8 flags, int paylen, gfp_t gfp); static struct sctp_chunk *sctp_make_data(const struct sctp_association *asoc, __u8 flags, int paylen, gfp_t gfp); static struct sctp_chunk *_sctp_make_chunk(const struct sctp_association *asoc, __u8 type, __u8 flags, int paylen, gfp_t gfp); static struct sctp_cookie_param *sctp_pack_cookie( const struct sctp_endpoint *ep, const struct sctp_association *asoc, const struct sctp_chunk *init_chunk, int *cookie_len, const __u8 *raw_addrs, int addrs_len); static int sctp_process_param(struct sctp_association *asoc, union sctp_params param, const union sctp_addr *peer_addr, gfp_t gfp); static void *sctp_addto_param(struct sctp_chunk *chunk, int len, const void *data); /* Control chunk destructor */ static void sctp_control_release_owner(struct sk_buff *skb) { struct sctp_chunk *chunk = skb_shinfo(skb)->destructor_arg; if (chunk->shkey) { struct sctp_shared_key *shkey = chunk->shkey; struct sctp_association *asoc = chunk->asoc; /* refcnt == 2 and !list_empty mean after this release, it's * not being used anywhere, and it's time to notify userland * that this shkey can be freed if it's been deactivated. */ if (shkey->deactivated && !list_empty(&shkey->key_list) && refcount_read(&shkey->refcnt) == 2) { struct sctp_ulpevent *ev; ev = sctp_ulpevent_make_authkey(asoc, shkey->key_id, SCTP_AUTH_FREE_KEY, GFP_KERNEL); if (ev) asoc->stream.si->enqueue_event(&asoc->ulpq, ev); } sctp_auth_shkey_release(chunk->shkey); } } static void sctp_control_set_owner_w(struct sctp_chunk *chunk) { struct sctp_association *asoc = chunk->asoc; struct sk_buff *skb = chunk->skb; /* TODO: properly account for control chunks. * To do it right we'll need: * 1) endpoint if association isn't known. * 2) proper memory accounting. * * For now don't do anything for now. */ if (chunk->auth) { chunk->shkey = asoc->shkey; sctp_auth_shkey_hold(chunk->shkey); } skb->sk = asoc ? asoc->base.sk : NULL; skb_shinfo(skb)->destructor_arg = chunk; skb->destructor = sctp_control_release_owner; } /* RFC 2960 3.3.2 Initiation (INIT) (1) * * Note 2: The ECN capable field is reserved for future use of * Explicit Congestion Notification. */ static const struct sctp_paramhdr ecap_param = { SCTP_PARAM_ECN_CAPABLE, cpu_to_be16(sizeof(struct sctp_paramhdr)), }; static const struct sctp_paramhdr prsctp_param = { SCTP_PARAM_FWD_TSN_SUPPORT, cpu_to_be16(sizeof(struct sctp_paramhdr)), }; /* A helper to initialize an op error inside a provided chunk, as most * cause codes will be embedded inside an abort chunk. */ int sctp_init_cause(struct sctp_chunk *chunk, __be16 cause_code, size_t paylen) { struct sctp_errhdr err; __u16 len; /* Cause code constants are now defined in network order. 
*/ err.cause = cause_code; len = sizeof(err) + paylen; err.length = htons(len); if (skb_tailroom(chunk->skb) < len) return -ENOSPC; chunk->subh.err_hdr = sctp_addto_chunk(chunk, sizeof(err), &err); return 0; } /* 3.3.2 Initiation (INIT) (1) * * This chunk is used to initiate a SCTP association between two * endpoints. The format of the INIT chunk is shown below: * * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Type = 1 | Chunk Flags | Chunk Length | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Initiate Tag | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Advertised Receiver Window Credit (a_rwnd) | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Number of Outbound Streams | Number of Inbound Streams | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Initial TSN | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * \ \ * / Optional/Variable-Length Parameters / * \ \ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * * The INIT chunk contains the following parameters. Unless otherwise * noted, each parameter MUST only be included once in the INIT chunk. * * Fixed Parameters Status * ---------------------------------------------- * Initiate Tag Mandatory * Advertised Receiver Window Credit Mandatory * Number of Outbound Streams Mandatory * Number of Inbound Streams Mandatory * Initial TSN Mandatory * * Variable Parameters Status Type Value * ------------------------------------------------------------- * IPv4 Address (Note 1) Optional 5 * IPv6 Address (Note 1) Optional 6 * Cookie Preservative Optional 9 * Reserved for ECN Capable (Note 2) Optional 32768 (0x8000) * Host Name Address (Note 3) Optional 11 * Supported Address Types (Note 4) Optional 12 */ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc, const struct sctp_bind_addr *bp, gfp_t gfp, int vparam_len) { struct sctp_supported_ext_param ext_param; struct sctp_adaptation_ind_param aiparam; struct sctp_paramhdr *auth_chunks = NULL; struct sctp_paramhdr *auth_hmacs = NULL; struct sctp_supported_addrs_param sat; struct sctp_endpoint *ep = asoc->ep; struct sctp_chunk *retval = NULL; int num_types, addrs_len = 0; struct sctp_inithdr init; union sctp_params addrs; struct sctp_sock *sp; __u8 extensions[5]; size_t chunksize; __be16 types[2]; int num_ext = 0; /* RFC 2960 3.3.2 Initiation (INIT) (1) * * Note 1: The INIT chunks can contain multiple addresses that * can be IPv4 and/or IPv6 in any combination. */ /* Convert the provided bind address list to raw format. */ addrs = sctp_bind_addrs_to_raw(bp, &addrs_len, gfp); init.init_tag = htonl(asoc->c.my_vtag); init.a_rwnd = htonl(asoc->rwnd); init.num_outbound_streams = htons(asoc->c.sinit_num_ostreams); init.num_inbound_streams = htons(asoc->c.sinit_max_instreams); init.initial_tsn = htonl(asoc->c.initial_tsn); /* How many address types are needed? */ sp = sctp_sk(asoc->base.sk); num_types = sp->pf->supported_addrs(sp, types); chunksize = sizeof(init) + addrs_len; chunksize += SCTP_PAD4(SCTP_SAT_LEN(num_types)); if (asoc->ep->ecn_enable) chunksize += sizeof(ecap_param); if (asoc->ep->prsctp_enable) chunksize += sizeof(prsctp_param); /* ADDIP: Section 4.2.7: * An implementation supporting this extension [ADDIP] MUST list * the ASCONF,the ASCONF-ACK, and the AUTH chunks in its INIT and * INIT-ACK parameters. 
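/* Editor's sketch (not part of the original source): sctp_init_cause() above
 * only writes the cause header into space the caller has already reserved,
 * so the usual pattern is to size the chunk for
 * sizeof(struct sctp_errhdr) + payload, then call sctp_init_cause() followed
 * by sctp_addto_chunk(), as sctp_make_abort_no_data() does later in this
 * file.  The helper below is hypothetical and merely restates that pattern.
 */
static struct sctp_chunk *example_abort_with_cause(const struct sctp_association *asoc,
						   const struct sctp_chunk *chunk,
						   __be16 cause, const void *payload,
						   size_t paylen)
{
	struct sctp_chunk *retval;

	retval = sctp_make_abort(asoc, chunk,
				 sizeof(struct sctp_errhdr) + paylen);
	if (!retval)
		return NULL;

	/* Fails with -ENOSPC only if the reserved tailroom is too small. */
	if (sctp_init_cause(retval, cause, paylen)) {
		sctp_chunk_free(retval);
		return NULL;
	}
	sctp_addto_chunk(retval, paylen, payload);
	return retval;
}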
*/ if (asoc->ep->asconf_enable) { extensions[num_ext] = SCTP_CID_ASCONF; extensions[num_ext+1] = SCTP_CID_ASCONF_ACK; num_ext += 2; } if (asoc->ep->reconf_enable) { extensions[num_ext] = SCTP_CID_RECONF; num_ext += 1; } if (sp->adaptation_ind) chunksize += sizeof(aiparam); if (asoc->ep->intl_enable) { extensions[num_ext] = SCTP_CID_I_DATA; num_ext += 1; } chunksize += vparam_len; /* Account for AUTH related parameters */ if (ep->auth_enable) { /* Add random parameter length*/ chunksize += sizeof(asoc->c.auth_random); /* Add HMACS parameter length if any were defined */ auth_hmacs = (struct sctp_paramhdr *)asoc->c.auth_hmacs; if (auth_hmacs->length) chunksize += SCTP_PAD4(ntohs(auth_hmacs->length)); else auth_hmacs = NULL; /* Add CHUNKS parameter length */ auth_chunks = (struct sctp_paramhdr *)asoc->c.auth_chunks; if (auth_chunks->length) chunksize += SCTP_PAD4(ntohs(auth_chunks->length)); else auth_chunks = NULL; extensions[num_ext] = SCTP_CID_AUTH; num_ext += 1; } /* If we have any extensions to report, account for that */ if (num_ext) chunksize += SCTP_PAD4(sizeof(ext_param) + num_ext); /* RFC 2960 3.3.2 Initiation (INIT) (1) * * Note 3: An INIT chunk MUST NOT contain more than one Host * Name address parameter. Moreover, the sender of the INIT * MUST NOT combine any other address types with the Host Name * address in the INIT. The receiver of INIT MUST ignore any * other address types if the Host Name address parameter is * present in the received INIT chunk. * * PLEASE DO NOT FIXME [This version does not support Host Name.] */ retval = sctp_make_control(asoc, SCTP_CID_INIT, 0, chunksize, gfp); if (!retval) goto nodata; retval->subh.init_hdr = sctp_addto_chunk(retval, sizeof(init), &init); retval->param_hdr.v = sctp_addto_chunk(retval, addrs_len, addrs.v); /* RFC 2960 3.3.2 Initiation (INIT) (1) * * Note 4: This parameter, when present, specifies all the * address types the sending endpoint can support. The absence * of this parameter indicates that the sending endpoint can * support any address type. */ sat.param_hdr.type = SCTP_PARAM_SUPPORTED_ADDRESS_TYPES; sat.param_hdr.length = htons(SCTP_SAT_LEN(num_types)); sctp_addto_chunk(retval, sizeof(sat), &sat); sctp_addto_chunk(retval, num_types * sizeof(__u16), &types); if (asoc->ep->ecn_enable) sctp_addto_chunk(retval, sizeof(ecap_param), &ecap_param); /* Add the supported extensions parameter. 
Be nice and add this * fist before addiding the parameters for the extensions themselves */ if (num_ext) { ext_param.param_hdr.type = SCTP_PARAM_SUPPORTED_EXT; ext_param.param_hdr.length = htons(sizeof(ext_param) + num_ext); sctp_addto_chunk(retval, sizeof(ext_param), &ext_param); sctp_addto_param(retval, num_ext, extensions); } if (asoc->ep->prsctp_enable) sctp_addto_chunk(retval, sizeof(prsctp_param), &prsctp_param); if (sp->adaptation_ind) { aiparam.param_hdr.type = SCTP_PARAM_ADAPTATION_LAYER_IND; aiparam.param_hdr.length = htons(sizeof(aiparam)); aiparam.adaptation_ind = htonl(sp->adaptation_ind); sctp_addto_chunk(retval, sizeof(aiparam), &aiparam); } /* Add SCTP-AUTH chunks to the parameter list */ if (ep->auth_enable) { sctp_addto_chunk(retval, sizeof(asoc->c.auth_random), asoc->c.auth_random); if (auth_hmacs) sctp_addto_chunk(retval, ntohs(auth_hmacs->length), auth_hmacs); if (auth_chunks) sctp_addto_chunk(retval, ntohs(auth_chunks->length), auth_chunks); } nodata: kfree(addrs.v); return retval; } struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc, const struct sctp_chunk *chunk, gfp_t gfp, int unkparam_len) { struct sctp_supported_ext_param ext_param; struct sctp_adaptation_ind_param aiparam; struct sctp_paramhdr *auth_chunks = NULL; struct sctp_paramhdr *auth_random = NULL; struct sctp_paramhdr *auth_hmacs = NULL; struct sctp_chunk *retval = NULL; struct sctp_cookie_param *cookie; struct sctp_inithdr initack; union sctp_params addrs; struct sctp_sock *sp; __u8 extensions[5]; size_t chunksize; int num_ext = 0; int cookie_len; int addrs_len; /* Note: there may be no addresses to embed. */ addrs = sctp_bind_addrs_to_raw(&asoc->base.bind_addr, &addrs_len, gfp); initack.init_tag = htonl(asoc->c.my_vtag); initack.a_rwnd = htonl(asoc->rwnd); initack.num_outbound_streams = htons(asoc->c.sinit_num_ostreams); initack.num_inbound_streams = htons(asoc->c.sinit_max_instreams); initack.initial_tsn = htonl(asoc->c.initial_tsn); /* FIXME: We really ought to build the cookie right * into the packet instead of allocating more fresh memory. */ cookie = sctp_pack_cookie(asoc->ep, asoc, chunk, &cookie_len, addrs.v, addrs_len); if (!cookie) goto nomem_cookie; /* Calculate the total size of allocation, include the reserved * space for reporting unknown parameters if it is specified. */ sp = sctp_sk(asoc->base.sk); chunksize = sizeof(initack) + addrs_len + cookie_len + unkparam_len; /* Tell peer that we'll do ECN only if peer advertised such cap. 
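/* Editor's note (not part of the original source): the Supported Extensions
 * parameter built above is the 4-byte parameter header followed by one byte
 * per advertised chunk type, so with e.g. ASCONF, ASCONF-ACK and AUTH
 * (num_ext == 3) its declared length is 4 + 3 = 7 bytes, while chunksize
 * reserved SCTP_PAD4(7) == 8 bytes for it; the pad byte itself is written by
 * sctp_addto_chunk() when the next parameter is appended, since that helper
 * first pads the chunk out to a 4-byte boundary.
 */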
*/ if (asoc->peer.ecn_capable) chunksize += sizeof(ecap_param); if (asoc->peer.prsctp_capable) chunksize += sizeof(prsctp_param); if (asoc->peer.asconf_capable) { extensions[num_ext] = SCTP_CID_ASCONF; extensions[num_ext+1] = SCTP_CID_ASCONF_ACK; num_ext += 2; } if (asoc->peer.reconf_capable) { extensions[num_ext] = SCTP_CID_RECONF; num_ext += 1; } if (sp->adaptation_ind) chunksize += sizeof(aiparam); if (asoc->peer.intl_capable) { extensions[num_ext] = SCTP_CID_I_DATA; num_ext += 1; } if (asoc->peer.auth_capable) { auth_random = (struct sctp_paramhdr *)asoc->c.auth_random; chunksize += ntohs(auth_random->length); auth_hmacs = (struct sctp_paramhdr *)asoc->c.auth_hmacs; if (auth_hmacs->length) chunksize += SCTP_PAD4(ntohs(auth_hmacs->length)); else auth_hmacs = NULL; auth_chunks = (struct sctp_paramhdr *)asoc->c.auth_chunks; if (auth_chunks->length) chunksize += SCTP_PAD4(ntohs(auth_chunks->length)); else auth_chunks = NULL; extensions[num_ext] = SCTP_CID_AUTH; num_ext += 1; } if (num_ext) chunksize += SCTP_PAD4(sizeof(ext_param) + num_ext); /* Now allocate and fill out the chunk. */ retval = sctp_make_control(asoc, SCTP_CID_INIT_ACK, 0, chunksize, gfp); if (!retval) goto nomem_chunk; /* RFC 2960 6.4 Multi-homed SCTP Endpoints * * An endpoint SHOULD transmit reply chunks (e.g., SACK, * HEARTBEAT ACK, * etc.) to the same destination transport * address from which it received the DATA or control chunk * to which it is replying. * * [INIT ACK back to where the INIT came from.] */ if (chunk->transport) retval->transport = sctp_assoc_lookup_paddr(asoc, &chunk->transport->ipaddr); retval->subh.init_hdr = sctp_addto_chunk(retval, sizeof(initack), &initack); retval->param_hdr.v = sctp_addto_chunk(retval, addrs_len, addrs.v); sctp_addto_chunk(retval, cookie_len, cookie); if (asoc->peer.ecn_capable) sctp_addto_chunk(retval, sizeof(ecap_param), &ecap_param); if (num_ext) { ext_param.param_hdr.type = SCTP_PARAM_SUPPORTED_EXT; ext_param.param_hdr.length = htons(sizeof(ext_param) + num_ext); sctp_addto_chunk(retval, sizeof(ext_param), &ext_param); sctp_addto_param(retval, num_ext, extensions); } if (asoc->peer.prsctp_capable) sctp_addto_chunk(retval, sizeof(prsctp_param), &prsctp_param); if (sp->adaptation_ind) { aiparam.param_hdr.type = SCTP_PARAM_ADAPTATION_LAYER_IND; aiparam.param_hdr.length = htons(sizeof(aiparam)); aiparam.adaptation_ind = htonl(sp->adaptation_ind); sctp_addto_chunk(retval, sizeof(aiparam), &aiparam); } if (asoc->peer.auth_capable) { sctp_addto_chunk(retval, ntohs(auth_random->length), auth_random); if (auth_hmacs) sctp_addto_chunk(retval, ntohs(auth_hmacs->length), auth_hmacs); if (auth_chunks) sctp_addto_chunk(retval, ntohs(auth_chunks->length), auth_chunks); } /* We need to remove the const qualifier at this point. */ retval->asoc = (struct sctp_association *) asoc; nomem_chunk: kfree(cookie); nomem_cookie: kfree(addrs.v); return retval; } /* 3.3.11 Cookie Echo (COOKIE ECHO) (10): * * This chunk is used only during the initialization of an association. * It is sent by the initiator of an association to its peer to complete * the initialization process. This chunk MUST precede any DATA chunk * sent within the association, but MAY be bundled with one or more DATA * chunks in the same packet. 
* * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Type = 10 |Chunk Flags | Length | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * / Cookie / * \ \ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * Chunk Flags: 8 bit * * Set to zero on transmit and ignored on receipt. * * Length: 16 bits (unsigned integer) * * Set to the size of the chunk in bytes, including the 4 bytes of * the chunk header and the size of the Cookie. * * Cookie: variable size * * This field must contain the exact cookie received in the * State Cookie parameter from the previous INIT ACK. * * An implementation SHOULD make the cookie as small as possible * to insure interoperability. */ struct sctp_chunk *sctp_make_cookie_echo(const struct sctp_association *asoc, const struct sctp_chunk *chunk) { struct sctp_chunk *retval; int cookie_len; void *cookie; cookie = asoc->peer.cookie; cookie_len = asoc->peer.cookie_len; /* Build a cookie echo chunk. */ retval = sctp_make_control(asoc, SCTP_CID_COOKIE_ECHO, 0, cookie_len, GFP_ATOMIC); if (!retval) goto nodata; retval->subh.cookie_hdr = sctp_addto_chunk(retval, cookie_len, cookie); /* RFC 2960 6.4 Multi-homed SCTP Endpoints * * An endpoint SHOULD transmit reply chunks (e.g., SACK, * HEARTBEAT ACK, * etc.) to the same destination transport * address from which it * received the DATA or control chunk * to which it is replying. * * [COOKIE ECHO back to where the INIT ACK came from.] */ if (chunk) retval->transport = chunk->transport; nodata: return retval; } /* 3.3.12 Cookie Acknowledgement (COOKIE ACK) (11): * * This chunk is used only during the initialization of an * association. It is used to acknowledge the receipt of a COOKIE * ECHO chunk. This chunk MUST precede any DATA or SACK chunk sent * within the association, but MAY be bundled with one or more DATA * chunks or SACK chunk in the same SCTP packet. * * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Type = 11 |Chunk Flags | Length = 4 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * Chunk Flags: 8 bits * * Set to zero on transmit and ignored on receipt. */ struct sctp_chunk *sctp_make_cookie_ack(const struct sctp_association *asoc, const struct sctp_chunk *chunk) { struct sctp_chunk *retval; retval = sctp_make_control(asoc, SCTP_CID_COOKIE_ACK, 0, 0, GFP_ATOMIC); /* RFC 2960 6.4 Multi-homed SCTP Endpoints * * An endpoint SHOULD transmit reply chunks (e.g., SACK, * HEARTBEAT ACK, * etc.) to the same destination transport * address from which it * received the DATA or control chunk * to which it is replying. * * [COOKIE ACK back to where the COOKIE ECHO came from.] */ if (retval && chunk && chunk->transport) retval->transport = sctp_assoc_lookup_paddr(asoc, &chunk->transport->ipaddr); return retval; } /* * Appendix A: Explicit Congestion Notification: * CWR: * * RFC 2481 details a specific bit for a sender to send in the header of * its next outbound TCP segment to indicate to its peer that it has * reduced its congestion window. This is termed the CWR bit. For * SCTP the same indication is made by including the CWR chunk. * This chunk contains one data element, i.e. the TSN number that * was sent in the ECNE chunk. This element represents the lowest * TSN number in the datagram that was originally marked with the * CE bit. 
* * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Chunk Type=13 | Flags=00000000| Chunk Length = 8 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Lowest TSN Number | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * Note: The CWR is considered a Control chunk. */ struct sctp_chunk *sctp_make_cwr(const struct sctp_association *asoc, const __u32 lowest_tsn, const struct sctp_chunk *chunk) { struct sctp_chunk *retval; struct sctp_cwrhdr cwr; cwr.lowest_tsn = htonl(lowest_tsn); retval = sctp_make_control(asoc, SCTP_CID_ECN_CWR, 0, sizeof(cwr), GFP_ATOMIC); if (!retval) goto nodata; retval->subh.ecn_cwr_hdr = sctp_addto_chunk(retval, sizeof(cwr), &cwr); /* RFC 2960 6.4 Multi-homed SCTP Endpoints * * An endpoint SHOULD transmit reply chunks (e.g., SACK, * HEARTBEAT ACK, * etc.) to the same destination transport * address from which it * received the DATA or control chunk * to which it is replying. * * [Report a reduced congestion window back to where the ECNE * came from.] */ if (chunk) retval->transport = chunk->transport; nodata: return retval; } /* Make an ECNE chunk. This is a congestion experienced report. */ struct sctp_chunk *sctp_make_ecne(const struct sctp_association *asoc, const __u32 lowest_tsn) { struct sctp_chunk *retval; struct sctp_ecnehdr ecne; ecne.lowest_tsn = htonl(lowest_tsn); retval = sctp_make_control(asoc, SCTP_CID_ECN_ECNE, 0, sizeof(ecne), GFP_ATOMIC); if (!retval) goto nodata; retval->subh.ecne_hdr = sctp_addto_chunk(retval, sizeof(ecne), &ecne); nodata: return retval; } /* Make a DATA chunk for the given association from the provided * parameters. However, do not populate the data payload. */ struct sctp_chunk *sctp_make_datafrag_empty(const struct sctp_association *asoc, const struct sctp_sndrcvinfo *sinfo, int len, __u8 flags, gfp_t gfp) { struct sctp_chunk *retval; struct sctp_datahdr dp; /* We assign the TSN as LATE as possible, not here when * creating the chunk. */ memset(&dp, 0, sizeof(dp)); dp.ppid = sinfo->sinfo_ppid; dp.stream = htons(sinfo->sinfo_stream); /* Set the flags for an unordered send. */ if (sinfo->sinfo_flags & SCTP_UNORDERED) flags |= SCTP_DATA_UNORDERED; retval = sctp_make_data(asoc, flags, sizeof(dp) + len, gfp); if (!retval) return NULL; retval->subh.data_hdr = sctp_addto_chunk(retval, sizeof(dp), &dp); memcpy(&retval->sinfo, sinfo, sizeof(struct sctp_sndrcvinfo)); return retval; } /* Create a selective ackowledgement (SACK) for the given * association. This reports on which TSN's we've seen to date, * including duplicates and gaps. */ struct sctp_chunk *sctp_make_sack(struct sctp_association *asoc) { struct sctp_tsnmap *map = (struct sctp_tsnmap *)&asoc->peer.tsn_map; struct sctp_gap_ack_block gabs[SCTP_MAX_GABS]; __u16 num_gabs, num_dup_tsns; struct sctp_transport *trans; struct sctp_chunk *retval; struct sctp_sackhdr sack; __u32 ctsn; int len; memset(gabs, 0, sizeof(gabs)); ctsn = sctp_tsnmap_get_ctsn(map); pr_debug("%s: sackCTSNAck sent:0x%x\n", __func__, ctsn); /* How much room is needed in the chunk? */ num_gabs = sctp_tsnmap_num_gabs(map, gabs); num_dup_tsns = sctp_tsnmap_num_dups(map); /* Initialize the SACK header. */ sack.cum_tsn_ack = htonl(ctsn); sack.a_rwnd = htonl(asoc->a_rwnd); sack.num_gap_ack_blocks = htons(num_gabs); sack.num_dup_tsns = htons(num_dup_tsns); len = sizeof(sack) + sizeof(struct sctp_gap_ack_block) * num_gabs + sizeof(__u32) * num_dup_tsns; /* Create the chunk. 
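/* Editor's note (not part of the original source): with the on-the-wire
 * sizes used here (12-byte SACK header, 4-byte gap ack blocks, 4-byte
 * duplicate TSNs), a SACK reporting e.g. 3 gap ack blocks and 2 duplicate
 * TSNs requests len = 12 + 3 * 4 + 2 * 4 = 32 bytes of payload from
 * sctp_make_control() below, on top of the chunk header that
 * _sctp_make_chunk() adds.
 */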
*/ retval = sctp_make_control(asoc, SCTP_CID_SACK, 0, len, GFP_ATOMIC); if (!retval) goto nodata; /* RFC 2960 6.4 Multi-homed SCTP Endpoints * * An endpoint SHOULD transmit reply chunks (e.g., SACK, * HEARTBEAT ACK, etc.) to the same destination transport * address from which it received the DATA or control chunk to * which it is replying. This rule should also be followed if * the endpoint is bundling DATA chunks together with the * reply chunk. * * However, when acknowledging multiple DATA chunks received * in packets from different source addresses in a single * SACK, the SACK chunk may be transmitted to one of the * destination transport addresses from which the DATA or * control chunks being acknowledged were received. * * [BUG: We do not implement the following paragraph. * Perhaps we should remember the last transport we used for a * SACK and avoid that (if possible) if we have seen any * duplicates. --piggy] * * When a receiver of a duplicate DATA chunk sends a SACK to a * multi- homed endpoint it MAY be beneficial to vary the * destination address and not use the source address of the * DATA chunk. The reason being that receiving a duplicate * from a multi-homed endpoint might indicate that the return * path (as specified in the source address of the DATA chunk) * for the SACK is broken. * * [Send to the address from which we last received a DATA chunk.] */ retval->transport = asoc->peer.last_data_from; retval->subh.sack_hdr = sctp_addto_chunk(retval, sizeof(sack), &sack); /* Add the gap ack block information. */ if (num_gabs) sctp_addto_chunk(retval, sizeof(__u32) * num_gabs, gabs); /* Add the duplicate TSN information. */ if (num_dup_tsns) { asoc->stats.idupchunks += num_dup_tsns; sctp_addto_chunk(retval, sizeof(__u32) * num_dup_tsns, sctp_tsnmap_get_dups(map)); } /* Once we have a sack generated, check to see what our sack * generation is, if its 0, reset the transports to 0, and reset * the association generation to 1 * * The idea is that zero is never used as a valid generation for the * association so no transport will match after a wrap event like this, * Until the next sack */ if (++asoc->peer.sack_generation == 0) { list_for_each_entry(trans, &asoc->peer.transport_addr_list, transports) trans->sack_generation = 0; asoc->peer.sack_generation = 1; } nodata: return retval; } /* Make a SHUTDOWN chunk. */ struct sctp_chunk *sctp_make_shutdown(const struct sctp_association *asoc, const struct sctp_chunk *chunk) { struct sctp_shutdownhdr shut; struct sctp_chunk *retval; __u32 ctsn; ctsn = sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map); shut.cum_tsn_ack = htonl(ctsn); retval = sctp_make_control(asoc, SCTP_CID_SHUTDOWN, 0, sizeof(shut), GFP_ATOMIC); if (!retval) goto nodata; retval->subh.shutdown_hdr = sctp_addto_chunk(retval, sizeof(shut), &shut); if (chunk) retval->transport = chunk->transport; nodata: return retval; } struct sctp_chunk *sctp_make_shutdown_ack(const struct sctp_association *asoc, const struct sctp_chunk *chunk) { struct sctp_chunk *retval; retval = sctp_make_control(asoc, SCTP_CID_SHUTDOWN_ACK, 0, 0, GFP_ATOMIC); /* RFC 2960 6.4 Multi-homed SCTP Endpoints * * An endpoint SHOULD transmit reply chunks (e.g., SACK, * HEARTBEAT ACK, * etc.) to the same destination transport * address from which it * received the DATA or control chunk * to which it is replying. * * [ACK back to where the SHUTDOWN came from.] 
*/ if (retval && chunk) retval->transport = chunk->transport; return retval; } struct sctp_chunk *sctp_make_shutdown_complete( const struct sctp_association *asoc, const struct sctp_chunk *chunk) { struct sctp_chunk *retval; __u8 flags = 0; /* Set the T-bit if we have no association (vtag will be * reflected) */ flags |= asoc ? 0 : SCTP_CHUNK_FLAG_T; retval = sctp_make_control(asoc, SCTP_CID_SHUTDOWN_COMPLETE, flags, 0, GFP_ATOMIC); /* RFC 2960 6.4 Multi-homed SCTP Endpoints * * An endpoint SHOULD transmit reply chunks (e.g., SACK, * HEARTBEAT ACK, * etc.) to the same destination transport * address from which it * received the DATA or control chunk * to which it is replying. * * [Report SHUTDOWN COMPLETE back to where the SHUTDOWN ACK * came from.] */ if (retval && chunk) retval->transport = chunk->transport; return retval; } /* Create an ABORT. Note that we set the T bit if we have no * association, except when responding to an INIT (sctpimpguide 2.41). */ struct sctp_chunk *sctp_make_abort(const struct sctp_association *asoc, const struct sctp_chunk *chunk, const size_t hint) { struct sctp_chunk *retval; __u8 flags = 0; /* Set the T-bit if we have no association and 'chunk' is not * an INIT (vtag will be reflected). */ if (!asoc) { if (chunk && chunk->chunk_hdr && chunk->chunk_hdr->type == SCTP_CID_INIT) flags = 0; else flags = SCTP_CHUNK_FLAG_T; } retval = sctp_make_control(asoc, SCTP_CID_ABORT, flags, hint, GFP_ATOMIC); /* RFC 2960 6.4 Multi-homed SCTP Endpoints * * An endpoint SHOULD transmit reply chunks (e.g., SACK, * HEARTBEAT ACK, * etc.) to the same destination transport * address from which it * received the DATA or control chunk * to which it is replying. * * [ABORT back to where the offender came from.] */ if (retval && chunk) retval->transport = chunk->transport; return retval; } /* Helper to create ABORT with a NO_USER_DATA error. */ struct sctp_chunk *sctp_make_abort_no_data( const struct sctp_association *asoc, const struct sctp_chunk *chunk, __u32 tsn) { struct sctp_chunk *retval; __be32 payload; retval = sctp_make_abort(asoc, chunk, sizeof(struct sctp_errhdr) + sizeof(tsn)); if (!retval) goto no_mem; /* Put the tsn back into network byte order. */ payload = htonl(tsn); sctp_init_cause(retval, SCTP_ERROR_NO_DATA, sizeof(payload)); sctp_addto_chunk(retval, sizeof(payload), (const void *)&payload); /* RFC 2960 6.4 Multi-homed SCTP Endpoints * * An endpoint SHOULD transmit reply chunks (e.g., SACK, * HEARTBEAT ACK, * etc.) to the same destination transport * address from which it * received the DATA or control chunk * to which it is replying. * * [ABORT back to where the offender came from.] */ if (chunk) retval->transport = chunk->transport; no_mem: return retval; } /* Helper to create ABORT with a SCTP_ERROR_USER_ABORT error. */ struct sctp_chunk *sctp_make_abort_user(const struct sctp_association *asoc, struct msghdr *msg, size_t paylen) { struct sctp_chunk *retval; void *payload = NULL; int err; retval = sctp_make_abort(asoc, NULL, sizeof(struct sctp_errhdr) + paylen); if (!retval) goto err_chunk; if (paylen) { /* Put the msg_iov together into payload. 
*/ payload = kmalloc(paylen, GFP_KERNEL); if (!payload) goto err_payload; err = memcpy_from_msg(payload, msg, paylen); if (err < 0) goto err_copy; } sctp_init_cause(retval, SCTP_ERROR_USER_ABORT, paylen); sctp_addto_chunk(retval, paylen, payload); if (paylen) kfree(payload); return retval; err_copy: kfree(payload); err_payload: sctp_chunk_free(retval); retval = NULL; err_chunk: return retval; } /* Append bytes to the end of a parameter. Will panic if chunk is not big * enough. */ static void *sctp_addto_param(struct sctp_chunk *chunk, int len, const void *data) { int chunklen = ntohs(chunk->chunk_hdr->length); void *target; target = skb_put(chunk->skb, len); if (data) memcpy(target, data, len); else memset(target, 0, len); /* Adjust the chunk length field. */ chunk->chunk_hdr->length = htons(chunklen + len); chunk->chunk_end = skb_tail_pointer(chunk->skb); return target; } /* Make an ABORT chunk with a PROTOCOL VIOLATION cause code. */ struct sctp_chunk *sctp_make_abort_violation( const struct sctp_association *asoc, const struct sctp_chunk *chunk, const __u8 *payload, const size_t paylen) { struct sctp_chunk *retval; struct sctp_paramhdr phdr; retval = sctp_make_abort(asoc, chunk, sizeof(struct sctp_errhdr) + paylen + sizeof(phdr)); if (!retval) goto end; sctp_init_cause(retval, SCTP_ERROR_PROTO_VIOLATION, paylen + sizeof(phdr)); phdr.type = htons(chunk->chunk_hdr->type); phdr.length = chunk->chunk_hdr->length; sctp_addto_chunk(retval, paylen, payload); sctp_addto_param(retval, sizeof(phdr), &phdr); end: return retval; } struct sctp_chunk *sctp_make_violation_paramlen( const struct sctp_association *asoc, const struct sctp_chunk *chunk, struct sctp_paramhdr *param) { static const char error[] = "The following parameter had invalid length:"; size_t payload_len = sizeof(error) + sizeof(struct sctp_errhdr) + sizeof(*param); struct sctp_chunk *retval; retval = sctp_make_abort(asoc, chunk, payload_len); if (!retval) goto nodata; sctp_init_cause(retval, SCTP_ERROR_PROTO_VIOLATION, sizeof(error) + sizeof(*param)); sctp_addto_chunk(retval, sizeof(error), error); sctp_addto_param(retval, sizeof(*param), param); nodata: return retval; } struct sctp_chunk *sctp_make_violation_max_retrans( const struct sctp_association *asoc, const struct sctp_chunk *chunk) { static const char error[] = "Association exceeded its max_retrans count"; size_t payload_len = sizeof(error) + sizeof(struct sctp_errhdr); struct sctp_chunk *retval; retval = sctp_make_abort(asoc, chunk, payload_len); if (!retval) goto nodata; sctp_init_cause(retval, SCTP_ERROR_PROTO_VIOLATION, sizeof(error)); sctp_addto_chunk(retval, sizeof(error), error); nodata: return retval; } struct sctp_chunk *sctp_make_new_encap_port(const struct sctp_association *asoc, const struct sctp_chunk *chunk) { struct sctp_new_encap_port_hdr nep; struct sctp_chunk *retval; retval = sctp_make_abort(asoc, chunk, sizeof(struct sctp_errhdr) + sizeof(nep)); if (!retval) goto nodata; sctp_init_cause(retval, SCTP_ERROR_NEW_ENCAP_PORT, sizeof(nep)); nep.cur_port = SCTP_INPUT_CB(chunk->skb)->encap_port; nep.new_port = chunk->transport->encap_port; sctp_addto_chunk(retval, sizeof(nep), &nep); nodata: return retval; } /* Make a HEARTBEAT chunk. 
*/ struct sctp_chunk *sctp_make_heartbeat(const struct sctp_association *asoc, const struct sctp_transport *transport, __u32 probe_size) { struct sctp_sender_hb_info hbinfo = {}; struct sctp_chunk *retval; retval = sctp_make_control(asoc, SCTP_CID_HEARTBEAT, 0, sizeof(hbinfo), GFP_ATOMIC); if (!retval) goto nodata; hbinfo.param_hdr.type = SCTP_PARAM_HEARTBEAT_INFO; hbinfo.param_hdr.length = htons(sizeof(hbinfo)); hbinfo.daddr = transport->ipaddr; hbinfo.sent_at = jiffies; hbinfo.hb_nonce = transport->hb_nonce; hbinfo.probe_size = probe_size; /* Cast away the 'const', as this is just telling the chunk * what transport it belongs to. */ retval->transport = (struct sctp_transport *) transport; retval->subh.hbs_hdr = sctp_addto_chunk(retval, sizeof(hbinfo), &hbinfo); retval->pmtu_probe = !!probe_size; nodata: return retval; } struct sctp_chunk *sctp_make_heartbeat_ack(const struct sctp_association *asoc, const struct sctp_chunk *chunk, const void *payload, const size_t paylen) { struct sctp_chunk *retval; retval = sctp_make_control(asoc, SCTP_CID_HEARTBEAT_ACK, 0, paylen, GFP_ATOMIC); if (!retval) goto nodata; retval->subh.hbs_hdr = sctp_addto_chunk(retval, paylen, payload); /* RFC 2960 6.4 Multi-homed SCTP Endpoints * * An endpoint SHOULD transmit reply chunks (e.g., SACK, * HEARTBEAT ACK, * etc.) to the same destination transport * address from which it * received the DATA or control chunk * to which it is replying. * * [HBACK back to where the HEARTBEAT came from.] */ if (chunk) retval->transport = chunk->transport; nodata: return retval; } /* RFC4820 3. Padding Chunk (PAD) * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Type = 0x84 | Flags=0 | Length | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | | * \ Padding Data / * / \ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ */ struct sctp_chunk *sctp_make_pad(const struct sctp_association *asoc, int len) { struct sctp_chunk *retval; retval = sctp_make_control(asoc, SCTP_CID_PAD, 0, len, GFP_ATOMIC); if (!retval) return NULL; skb_put_zero(retval->skb, len); retval->chunk_hdr->length = htons(ntohs(retval->chunk_hdr->length) + len); retval->chunk_end = skb_tail_pointer(retval->skb); return retval; } /* Create an Operation Error chunk with the specified space reserved. * This routine can be used for containing multiple causes in the chunk. */ static struct sctp_chunk *sctp_make_op_error_space( const struct sctp_association *asoc, const struct sctp_chunk *chunk, size_t size) { struct sctp_chunk *retval; retval = sctp_make_control(asoc, SCTP_CID_ERROR, 0, sizeof(struct sctp_errhdr) + size, GFP_ATOMIC); if (!retval) goto nodata; /* RFC 2960 6.4 Multi-homed SCTP Endpoints * * An endpoint SHOULD transmit reply chunks (e.g., SACK, * HEARTBEAT ACK, etc.) to the same destination transport * address from which it received the DATA or control chunk * to which it is replying. * */ if (chunk) retval->transport = chunk->transport; nodata: return retval; } /* Create an Operation Error chunk of a fixed size, specifically, * min(asoc->pathmtu, SCTP_DEFAULT_MAXSEGMENT) - overheads. * This is a helper function to allocate an error chunk for those * invalid parameter codes in which we may not want to report all the * errors, if the incoming chunk is large. If it can't fit in a single * packet, we ignore it. 
*/ static inline struct sctp_chunk *sctp_make_op_error_limited( const struct sctp_association *asoc, const struct sctp_chunk *chunk) { size_t size = SCTP_DEFAULT_MAXSEGMENT; struct sctp_sock *sp = NULL; if (asoc) { size = min_t(size_t, size, asoc->pathmtu); sp = sctp_sk(asoc->base.sk); } size = sctp_mtu_payload(sp, size, sizeof(struct sctp_errhdr)); return sctp_make_op_error_space(asoc, chunk, size); } /* Create an Operation Error chunk. */ struct sctp_chunk *sctp_make_op_error(const struct sctp_association *asoc, const struct sctp_chunk *chunk, __be16 cause_code, const void *payload, size_t paylen, size_t reserve_tail) { struct sctp_chunk *retval; retval = sctp_make_op_error_space(asoc, chunk, paylen + reserve_tail); if (!retval) goto nodata; sctp_init_cause(retval, cause_code, paylen + reserve_tail); sctp_addto_chunk(retval, paylen, payload); if (reserve_tail) sctp_addto_param(retval, reserve_tail, NULL); nodata: return retval; } struct sctp_chunk *sctp_make_auth(const struct sctp_association *asoc, __u16 key_id) { struct sctp_authhdr auth_hdr; const struct sctp_hmac *hmac_desc; struct sctp_chunk *retval; /* Get the first hmac that the peer told us to use */ hmac_desc = sctp_auth_asoc_get_hmac(asoc); if (unlikely(!hmac_desc)) return NULL; retval = sctp_make_control(asoc, SCTP_CID_AUTH, 0, hmac_desc->hmac_len + sizeof(auth_hdr), GFP_ATOMIC); if (!retval) return NULL; auth_hdr.hmac_id = htons(hmac_desc->hmac_id); auth_hdr.shkey_id = htons(key_id); retval->subh.auth_hdr = sctp_addto_chunk(retval, sizeof(auth_hdr), &auth_hdr); skb_put_zero(retval->skb, hmac_desc->hmac_len); /* Adjust the chunk header to include the empty MAC */ retval->chunk_hdr->length = htons(ntohs(retval->chunk_hdr->length) + hmac_desc->hmac_len); retval->chunk_end = skb_tail_pointer(retval->skb); return retval; } /******************************************************************** * 2nd Level Abstractions ********************************************************************/ /* Turn an skb into a chunk. * FIXME: Eventually move the structure directly inside the skb->cb[]. * * sctpimpguide-05.txt Section 2.8.2 * M1) Each time a new DATA chunk is transmitted * set the 'TSN.Missing.Report' count for that TSN to 0. The * 'TSN.Missing.Report' count will be used to determine missing chunks * and when to fast retransmit. * */ struct sctp_chunk *sctp_chunkify(struct sk_buff *skb, const struct sctp_association *asoc, struct sock *sk, gfp_t gfp) { struct sctp_chunk *retval; retval = kmem_cache_zalloc(sctp_chunk_cachep, gfp); if (!retval) goto nodata; if (!sk) pr_debug("%s: chunkifying skb:%p w/o an sk\n", __func__, skb); INIT_LIST_HEAD(&retval->list); retval->skb = skb; retval->asoc = (struct sctp_association *)asoc; retval->singleton = 1; retval->fast_retransmit = SCTP_CAN_FRTX; /* Polish the bead hole. */ INIT_LIST_HEAD(&retval->transmitted_list); INIT_LIST_HEAD(&retval->frag_list); SCTP_DBG_OBJCNT_INC(chunk); refcount_set(&retval->refcnt, 1); nodata: return retval; } /* Set chunk->source and dest based on the IP header in chunk->skb. */ void sctp_init_addrs(struct sctp_chunk *chunk, union sctp_addr *src, union sctp_addr *dest) { memcpy(&chunk->source, src, sizeof(union sctp_addr)); memcpy(&chunk->dest, dest, sizeof(union sctp_addr)); } /* Extract the source address from a chunk. */ const union sctp_addr *sctp_source(const struct sctp_chunk *chunk) { /* If we have a known transport, use that. */ if (chunk->transport) { return &chunk->transport->ipaddr; } else { /* Otherwise, extract it from the IP header. 
*/ return &chunk->source; } } /* Create a new chunk, setting the type and flags headers from the * arguments, reserving enough space for a 'paylen' byte payload. */ static struct sctp_chunk *_sctp_make_chunk(const struct sctp_association *asoc, __u8 type, __u8 flags, int paylen, gfp_t gfp) { struct sctp_chunkhdr *chunk_hdr; struct sctp_chunk *retval; struct sk_buff *skb; struct sock *sk; int chunklen; chunklen = SCTP_PAD4(sizeof(*chunk_hdr) + paylen); if (chunklen > SCTP_MAX_CHUNK_LEN) goto nodata; /* No need to allocate LL here, as this is only a chunk. */ skb = alloc_skb(chunklen, gfp); if (!skb) goto nodata; /* Make room for the chunk header. */ chunk_hdr = (struct sctp_chunkhdr *)skb_put(skb, sizeof(*chunk_hdr)); chunk_hdr->type = type; chunk_hdr->flags = flags; chunk_hdr->length = htons(sizeof(*chunk_hdr)); sk = asoc ? asoc->base.sk : NULL; retval = sctp_chunkify(skb, asoc, sk, gfp); if (!retval) { kfree_skb(skb); goto nodata; } retval->chunk_hdr = chunk_hdr; retval->chunk_end = ((__u8 *)chunk_hdr) + sizeof(*chunk_hdr); /* Determine if the chunk needs to be authenticated */ if (sctp_auth_send_cid(type, asoc)) retval->auth = 1; return retval; nodata: return NULL; } static struct sctp_chunk *sctp_make_data(const struct sctp_association *asoc, __u8 flags, int paylen, gfp_t gfp) { return _sctp_make_chunk(asoc, SCTP_CID_DATA, flags, paylen, gfp); } struct sctp_chunk *sctp_make_idata(const struct sctp_association *asoc, __u8 flags, int paylen, gfp_t gfp) { return _sctp_make_chunk(asoc, SCTP_CID_I_DATA, flags, paylen, gfp); } static struct sctp_chunk *sctp_make_control(const struct sctp_association *asoc, __u8 type, __u8 flags, int paylen, gfp_t gfp) { struct sctp_chunk *chunk; chunk = _sctp_make_chunk(asoc, type, flags, paylen, gfp); if (chunk) sctp_control_set_owner_w(chunk); return chunk; } /* Release the memory occupied by a chunk. */ static void sctp_chunk_destroy(struct sctp_chunk *chunk) { BUG_ON(!list_empty(&chunk->list)); list_del_init(&chunk->transmitted_list); consume_skb(chunk->skb); consume_skb(chunk->auth_chunk); SCTP_DBG_OBJCNT_DEC(chunk); kmem_cache_free(sctp_chunk_cachep, chunk); } /* Possibly, free the chunk. */ void sctp_chunk_free(struct sctp_chunk *chunk) { /* Release our reference on the message tracker. */ if (chunk->msg) sctp_datamsg_put(chunk->msg); sctp_chunk_put(chunk); } /* Grab a reference to the chunk. */ void sctp_chunk_hold(struct sctp_chunk *ch) { refcount_inc(&ch->refcnt); } /* Release a reference to the chunk. */ void sctp_chunk_put(struct sctp_chunk *ch) { if (refcount_dec_and_test(&ch->refcnt)) sctp_chunk_destroy(ch); } /* Append bytes to the end of a chunk. Will panic if chunk is not big * enough. */ void *sctp_addto_chunk(struct sctp_chunk *chunk, int len, const void *data) { int chunklen = ntohs(chunk->chunk_hdr->length); int padlen = SCTP_PAD4(chunklen) - chunklen; void *target; skb_put_zero(chunk->skb, padlen); target = skb_put_data(chunk->skb, data, len); /* Adjust the chunk length field. */ chunk->chunk_hdr->length = htons(chunklen + padlen + len); chunk->chunk_end = skb_tail_pointer(chunk->skb); return target; } /* Append bytes from user space to the end of a chunk. Will panic if * chunk is not big enough. * Returns a kernel err value. */ int sctp_user_addto_chunk(struct sctp_chunk *chunk, int len, struct iov_iter *from) { void *target; /* Make room in chunk for data. */ target = skb_put(chunk->skb, len); /* Copy data (whole iovec) into chunk */ if (!copy_from_iter_full(target, len, from)) return -EFAULT; /* Adjust the chunk length field. 
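/* Editor's note (not part of the original source): because sctp_addto_chunk()
 * above pads the existing chunk out to a 4-byte boundary before appending, a
 * chunk whose current length is, say, 18 bytes first gets
 * SCTP_PAD4(18) - 18 = 2 bytes of zeroes, and the length field then advances
 * by padlen + len.  Callers therefore do not need to pad parameters
 * themselves before handing them to these helpers.
 */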
*/ chunk->chunk_hdr->length = htons(ntohs(chunk->chunk_hdr->length) + len); chunk->chunk_end = skb_tail_pointer(chunk->skb); return 0; } /* Helper function to assign a TSN if needed. This assumes that both * the data_hdr and association have already been assigned. */ void sctp_chunk_assign_ssn(struct sctp_chunk *chunk) { struct sctp_stream *stream; struct sctp_chunk *lchunk; struct sctp_datamsg *msg; __u16 ssn, sid; if (chunk->has_ssn) return; /* All fragments will be on the same stream */ sid = ntohs(chunk->subh.data_hdr->stream); stream = &chunk->asoc->stream; /* Now assign the sequence number to the entire message. * All fragments must have the same stream sequence number. */ msg = chunk->msg; list_for_each_entry(lchunk, &msg->chunks, frag_list) { if (lchunk->chunk_hdr->flags & SCTP_DATA_UNORDERED) { ssn = 0; } else { if (lchunk->chunk_hdr->flags & SCTP_DATA_LAST_FRAG) ssn = sctp_ssn_next(stream, out, sid); else ssn = sctp_ssn_peek(stream, out, sid); } lchunk->subh.data_hdr->ssn = htons(ssn); lchunk->has_ssn = 1; } } /* Helper function to assign a TSN if needed. This assumes that both * the data_hdr and association have already been assigned. */ void sctp_chunk_assign_tsn(struct sctp_chunk *chunk) { if (!chunk->has_tsn) { /* This is the last possible instant to * assign a TSN. */ chunk->subh.data_hdr->tsn = htonl(sctp_association_get_next_tsn(chunk->asoc)); chunk->has_tsn = 1; } } /* Create a CLOSED association to use with an incoming packet. */ struct sctp_association *sctp_make_temp_asoc(const struct sctp_endpoint *ep, struct sctp_chunk *chunk, gfp_t gfp) { struct sctp_association *asoc; enum sctp_scope scope; struct sk_buff *skb; /* Create the bare association. */ scope = sctp_scope(sctp_source(chunk)); asoc = sctp_association_new(ep, ep->base.sk, scope, gfp); if (!asoc) goto nodata; asoc->temp = 1; skb = chunk->skb; /* Create an entry for the source address of the packet. */ SCTP_INPUT_CB(skb)->af->from_skb(&asoc->c.peer_addr, skb, 1); nodata: return asoc; } /* Build a cookie representing asoc. * This INCLUDES the param header needed to put the cookie in the INIT ACK. */ static struct sctp_cookie_param *sctp_pack_cookie( const struct sctp_endpoint *ep, const struct sctp_association *asoc, const struct sctp_chunk *init_chunk, int *cookie_len, const __u8 *raw_addrs, int addrs_len) { struct sctp_signed_cookie *cookie; struct sctp_cookie_param *retval; int headersize, bodysize; /* Header size is static data prior to the actual cookie, including * any padding. */ headersize = sizeof(struct sctp_paramhdr) + (sizeof(struct sctp_signed_cookie) - sizeof(struct sctp_cookie)); bodysize = sizeof(struct sctp_cookie) + ntohs(init_chunk->chunk_hdr->length) + addrs_len; /* Pad out the cookie to a multiple to make the signature * functions simpler to write. */ if (bodysize % SCTP_COOKIE_MULTIPLE) bodysize += SCTP_COOKIE_MULTIPLE - (bodysize % SCTP_COOKIE_MULTIPLE); *cookie_len = headersize + bodysize; /* Clear this memory since we are sending this data structure * out on the network. */ retval = kzalloc(*cookie_len, GFP_ATOMIC); if (!retval) { *cookie_len = 0; return NULL; } cookie = (struct sctp_signed_cookie *) retval->body; /* Set up the parameter header. */ retval->p.type = SCTP_PARAM_STATE_COOKIE; retval->p.length = htons(*cookie_len); /* Copy the cookie part of the association itself. */ cookie->c = asoc->c; /* Save the raw address list length in the cookie. */ cookie->c.raw_addr_list_len = addrs_len; /* Remember PR-SCTP capability. 
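/* Editor's note (not part of the original source): assuming, for
 * illustration, that SCTP_COOKIE_MULTIPLE is 32, a cookie whose raw bodysize
 * comes to 310 bytes is rounded up above to 320 before the parameter is
 * allocated, so the signing code further down always runs over a whole
 * multiple of that constant.
 */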
*/ cookie->c.prsctp_capable = asoc->peer.prsctp_capable; /* Save adaptation indication in the cookie. */ cookie->c.adaptation_ind = asoc->peer.adaptation_ind; /* Set an expiration time for the cookie. */ cookie->c.expiration = ktime_add(asoc->cookie_life, ktime_get_real()); /* Copy the peer's init packet. */ memcpy(cookie + 1, init_chunk->chunk_hdr, ntohs(init_chunk->chunk_hdr->length)); /* Copy the raw local address list of the association. */ memcpy((__u8 *)(cookie + 1) + ntohs(init_chunk->chunk_hdr->length), raw_addrs, addrs_len); /* Sign the cookie, if cookie authentication is enabled. */ if (sctp_sk(ep->base.sk)->cookie_auth_enable) { static_assert(sizeof(cookie->mac) == SHA256_DIGEST_SIZE); hmac_sha256(&ep->cookie_auth_key, (const u8 *)&cookie->c, bodysize, cookie->mac); } return retval; } /* Unpack the cookie from COOKIE ECHO chunk, recreating the association. */ struct sctp_association *sctp_unpack_cookie( const struct sctp_endpoint *ep, const struct sctp_association *asoc, struct sctp_chunk *chunk, gfp_t gfp, int *error, struct sctp_chunk **errp) { struct sctp_association *retval = NULL; int headersize, bodysize, fixed_size; struct sctp_signed_cookie *cookie; struct sk_buff *skb = chunk->skb; struct sctp_cookie *bear_cookie; enum sctp_scope scope; unsigned int len; ktime_t kt; /* Header size is static data prior to the actual cookie, including * any padding. */ headersize = sizeof(struct sctp_chunkhdr) + (sizeof(struct sctp_signed_cookie) - sizeof(struct sctp_cookie)); bodysize = ntohs(chunk->chunk_hdr->length) - headersize; fixed_size = headersize + sizeof(struct sctp_cookie); /* Verify that the chunk looks like it even has a cookie. * There must be enough room for our cookie and our peer's * INIT chunk. */ len = ntohs(chunk->chunk_hdr->length); if (len < fixed_size + sizeof(struct sctp_chunkhdr)) goto malformed; /* Verify that the cookie has been padded out. */ if (bodysize % SCTP_COOKIE_MULTIPLE) goto malformed; /* Process the cookie. */ cookie = chunk->subh.cookie_hdr; bear_cookie = &cookie->c; /* Verify the cookie's MAC, if cookie authentication is enabled. */ if (sctp_sk(ep->base.sk)->cookie_auth_enable) { u8 mac[SHA256_DIGEST_SIZE]; hmac_sha256(&ep->cookie_auth_key, (const u8 *)bear_cookie, bodysize, mac); static_assert(sizeof(cookie->mac) == sizeof(mac)); if (crypto_memneq(mac, cookie->mac, sizeof(mac))) { *error = -SCTP_IERROR_BAD_SIG; goto fail; } } /* IG Section 2.35.2: * 3) Compare the port numbers and the verification tag contained * within the COOKIE ECHO chunk to the actual port numbers and the * verification tag within the SCTP common header of the received * packet. If these values do not match the packet MUST be silently * discarded, */ if (ntohl(chunk->sctp_hdr->vtag) != bear_cookie->my_vtag) { *error = -SCTP_IERROR_BAD_TAG; goto fail; } if (chunk->sctp_hdr->source != bear_cookie->peer_addr.v4.sin_port || ntohs(chunk->sctp_hdr->dest) != bear_cookie->my_port) { *error = -SCTP_IERROR_BAD_PORTS; goto fail; } /* Check to see if the cookie is stale. If there is already * an association, there is no need to check cookie's expiration * for init collision case of lost COOKIE ACK. * If skb has been timestamped, then use the stamp, otherwise * use current time. This introduces a small possibility that * a cookie may be considered expired, but this would only slow * down the new association establishment instead of every packet. 
*/ if (sock_flag(ep->base.sk, SOCK_TIMESTAMP)) kt = skb_get_ktime(skb); else kt = ktime_get_real(); if (!asoc && ktime_before(bear_cookie->expiration, kt)) { suseconds_t usecs = ktime_to_us(ktime_sub(kt, bear_cookie->expiration)); __be32 n = htonl(usecs); /* * Section 3.3.10.3 Stale Cookie Error (3) * * Cause of error * --------------- * Stale Cookie Error: Indicates the receipt of a valid State * Cookie that has expired. */ *errp = sctp_make_op_error(asoc, chunk, SCTP_ERROR_STALE_COOKIE, &n, sizeof(n), 0); if (*errp) *error = -SCTP_IERROR_STALE_COOKIE; else *error = -SCTP_IERROR_NOMEM; goto fail; } /* Make a new base association. */ scope = sctp_scope(sctp_source(chunk)); retval = sctp_association_new(ep, ep->base.sk, scope, gfp); if (!retval) { *error = -SCTP_IERROR_NOMEM; goto fail; } /* Set up our peer's port number. */ retval->peer.port = ntohs(chunk->sctp_hdr->source); /* Populate the association from the cookie. */ memcpy(&retval->c, bear_cookie, sizeof(*bear_cookie)); if (sctp_assoc_set_bind_addr_from_cookie(retval, bear_cookie, GFP_ATOMIC) < 0) { *error = -SCTP_IERROR_NOMEM; goto fail; } /* Also, add the destination address. */ if (list_empty(&retval->base.bind_addr.address_list)) { sctp_add_bind_addr(&retval->base.bind_addr, &chunk->dest, sizeof(chunk->dest), SCTP_ADDR_SRC, GFP_ATOMIC); } retval->next_tsn = retval->c.initial_tsn; retval->ctsn_ack_point = retval->next_tsn - 1; retval->addip_serial = retval->c.initial_tsn; retval->strreset_outseq = retval->c.initial_tsn; retval->adv_peer_ack_point = retval->ctsn_ack_point; retval->peer.prsctp_capable = retval->c.prsctp_capable; retval->peer.adaptation_ind = retval->c.adaptation_ind; /* The INIT stuff will be done by the side effects. */ return retval; fail: if (retval) sctp_association_free(retval); return NULL; malformed: /* Yikes! The packet is either corrupt or deliberately * malformed. */ *error = -SCTP_IERROR_MALFORMED; goto fail; } /******************************************************************** * 3rd Level Abstractions ********************************************************************/ struct __sctp_missing { __be32 num_missing; __be16 type; } __packed; /* * Report a missing mandatory parameter. */ static int sctp_process_missing_param(const struct sctp_association *asoc, enum sctp_param paramtype, struct sctp_chunk *chunk, struct sctp_chunk **errp) { struct __sctp_missing report; __u16 len; len = SCTP_PAD4(sizeof(report)); /* Make an ERROR chunk, preparing enough room for * returning multiple unknown parameters. */ if (!*errp) *errp = sctp_make_op_error_space(asoc, chunk, len); if (*errp) { report.num_missing = htonl(1); report.type = paramtype; sctp_init_cause(*errp, SCTP_ERROR_MISS_PARAM, sizeof(report)); sctp_addto_chunk(*errp, sizeof(report), &report); } /* Stop processing this chunk. */ return 0; } /* Report an Invalid Mandatory Parameter. */ static int sctp_process_inv_mandatory(const struct sctp_association *asoc, struct sctp_chunk *chunk, struct sctp_chunk **errp) { /* Invalid Mandatory Parameter Error has no payload. */ if (!*errp) *errp = sctp_make_op_error_space(asoc, chunk, 0); if (*errp) sctp_init_cause(*errp, SCTP_ERROR_INV_PARAM, 0); /* Stop processing this chunk. */ return 0; } static int sctp_process_inv_paramlength(const struct sctp_association *asoc, struct sctp_paramhdr *param, const struct sctp_chunk *chunk, struct sctp_chunk **errp) { /* This is a fatal error. Any accumulated non-fatal errors are * not reported. 
*/ if (*errp) sctp_chunk_free(*errp); /* Create an error chunk and fill it in with our payload. */ *errp = sctp_make_violation_paramlen(asoc, chunk, param); return 0; } /* Do not attempt to handle the HOST_NAME parm. However, do * send back an indicator to the peer. */ static int sctp_process_hn_param(const struct sctp_association *asoc, union sctp_params param, struct sctp_chunk *chunk, struct sctp_chunk **errp) { __u16 len = ntohs(param.p->length); /* Processing of the HOST_NAME parameter will generate an * ABORT. If we've accumulated any non-fatal errors, they * would be unrecognized parameters and we should not include * them in the ABORT. */ if (*errp) sctp_chunk_free(*errp); *errp = sctp_make_op_error(asoc, chunk, SCTP_ERROR_DNS_FAILED, param.v, len, 0); /* Stop processing this chunk. */ return 0; } static int sctp_verify_ext_param(struct net *net, const struct sctp_endpoint *ep, union sctp_params param) { __u16 num_ext = ntohs(param.p->length) - sizeof(struct sctp_paramhdr); int have_asconf = 0; int have_auth = 0; int i; for (i = 0; i < num_ext; i++) { switch (param.ext->chunks[i]) { case SCTP_CID_AUTH: have_auth = 1; break; case SCTP_CID_ASCONF: case SCTP_CID_ASCONF_ACK: have_asconf = 1; break; } } /* ADD-IP Security: The draft requires us to ABORT or ignore the * INIT/INIT-ACK if ADD-IP is listed, but AUTH is not. Do this * only if ADD-IP is turned on and we are not backward-compatible * mode. */ if (net->sctp.addip_noauth) return 1; if (ep->asconf_enable && !have_auth && have_asconf) return 0; return 1; } static void sctp_process_ext_param(struct sctp_association *asoc, union sctp_params param) { __u16 num_ext = ntohs(param.p->length) - sizeof(struct sctp_paramhdr); int i; for (i = 0; i < num_ext; i++) { switch (param.ext->chunks[i]) { case SCTP_CID_RECONF: if (asoc->ep->reconf_enable) asoc->peer.reconf_capable = 1; break; case SCTP_CID_FWD_TSN: if (asoc->ep->prsctp_enable) asoc->peer.prsctp_capable = 1; break; case SCTP_CID_AUTH: /* if the peer reports AUTH, assume that he * supports AUTH. */ if (asoc->ep->auth_enable) asoc->peer.auth_capable = 1; break; case SCTP_CID_ASCONF: case SCTP_CID_ASCONF_ACK: if (asoc->ep->asconf_enable) asoc->peer.asconf_capable = 1; break; case SCTP_CID_I_DATA: if (asoc->ep->intl_enable) asoc->peer.intl_capable = 1; break; default: break; } } } /* RFC 3.2.1 & the Implementers Guide 2.2. * * The Parameter Types are encoded such that the * highest-order two bits specify the action that must be * taken if the processing endpoint does not recognize the * Parameter Type. * * 00 - Stop processing this parameter; do not process any further * parameters within this chunk * * 01 - Stop processing this parameter, do not process any further * parameters within this chunk, and report the unrecognized * parameter in an 'Unrecognized Parameter' ERROR chunk. * * 10 - Skip this parameter and continue processing. * * 11 - Skip this parameter and continue processing but * report the unrecognized parameter in an * 'Unrecognized Parameter' ERROR chunk. * * Return value: * SCTP_IERROR_NO_ERROR - continue with the chunk * SCTP_IERROR_ERROR - stop and report an error. * SCTP_IERROR_NOMEME - out of memory. 
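/* Editor's note (not part of the original source): the action comes from the
 * two most significant bits of the 16-bit parameter type, which
 * sctp_process_unk_param() below extracts with SCTP_PARAM_ACTION_MASK:
 * 00 stops processing the rest of the chunk, 01 stops and reports the
 * parameter, 10 skips it, and 11 skips it but reports it.  An unknown
 * parameter of type 0xC00B, for instance, is skipped and echoed back in an
 * 'Unrecognized Parameter' ERROR chunk.
 */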
*/ static enum sctp_ierror sctp_process_unk_param( const struct sctp_association *asoc, union sctp_params param, struct sctp_chunk *chunk, struct sctp_chunk **errp) { int retval = SCTP_IERROR_NO_ERROR; switch (param.p->type & SCTP_PARAM_ACTION_MASK) { case SCTP_PARAM_ACTION_DISCARD: retval = SCTP_IERROR_ERROR; break; case SCTP_PARAM_ACTION_SKIP: break; case SCTP_PARAM_ACTION_DISCARD_ERR: retval = SCTP_IERROR_ERROR; fallthrough; case SCTP_PARAM_ACTION_SKIP_ERR: /* Make an ERROR chunk, preparing enough room for * returning multiple unknown parameters. */ if (!*errp) { *errp = sctp_make_op_error_limited(asoc, chunk); if (!*errp) { /* If there is no memory for generating the * ERROR report as specified, an ABORT will be * triggered to the peer and the association * won't be established. */ retval = SCTP_IERROR_NOMEM; break; } } if (!sctp_init_cause(*errp, SCTP_ERROR_UNKNOWN_PARAM, ntohs(param.p->length))) sctp_addto_chunk(*errp, ntohs(param.p->length), param.v); break; default: break; } return retval; } /* Verify variable length parameters * Return values: * SCTP_IERROR_ABORT - trigger an ABORT * SCTP_IERROR_NOMEM - out of memory (abort) * SCTP_IERROR_ERROR - stop processing, trigger an ERROR * SCTP_IERROR_NO_ERROR - continue with the chunk */ static enum sctp_ierror sctp_verify_param(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, union sctp_params param, enum sctp_cid cid, struct sctp_chunk *chunk, struct sctp_chunk **err_chunk) { struct sctp_hmac_algo_param *hmacs; int retval = SCTP_IERROR_NO_ERROR; __u16 n_elt, id = 0; int i; /* FIXME - This routine is not looking at each parameter per the * chunk type, i.e., unrecognized parameters should be further * identified based on the chunk id. */ switch (param.p->type) { case SCTP_PARAM_IPV4_ADDRESS: case SCTP_PARAM_IPV6_ADDRESS: case SCTP_PARAM_COOKIE_PRESERVATIVE: case SCTP_PARAM_SUPPORTED_ADDRESS_TYPES: case SCTP_PARAM_STATE_COOKIE: case SCTP_PARAM_HEARTBEAT_INFO: case SCTP_PARAM_UNRECOGNIZED_PARAMETERS: case SCTP_PARAM_ECN_CAPABLE: case SCTP_PARAM_ADAPTATION_LAYER_IND: break; case SCTP_PARAM_SUPPORTED_EXT: if (!sctp_verify_ext_param(net, ep, param)) return SCTP_IERROR_ABORT; break; case SCTP_PARAM_SET_PRIMARY: if (!ep->asconf_enable) goto unhandled; if (ntohs(param.p->length) < sizeof(struct sctp_addip_param) + sizeof(struct sctp_paramhdr)) { sctp_process_inv_paramlength(asoc, param.p, chunk, err_chunk); retval = SCTP_IERROR_ABORT; } break; case SCTP_PARAM_HOST_NAME_ADDRESS: /* This param has been Deprecated, send ABORT. */ sctp_process_hn_param(asoc, param, chunk, err_chunk); retval = SCTP_IERROR_ABORT; break; case SCTP_PARAM_FWD_TSN_SUPPORT: if (ep->prsctp_enable) break; goto unhandled; case SCTP_PARAM_RANDOM: if (!ep->auth_enable) goto unhandled; /* SCTP-AUTH: Secion 6.1 * If the random number is not 32 byte long the association * MUST be aborted. The ABORT chunk SHOULD contain the error * cause 'Protocol Violation'. */ if (SCTP_AUTH_RANDOM_LENGTH != ntohs(param.p->length) - sizeof(struct sctp_paramhdr)) { sctp_process_inv_paramlength(asoc, param.p, chunk, err_chunk); retval = SCTP_IERROR_ABORT; } break; case SCTP_PARAM_CHUNKS: if (!ep->auth_enable) goto unhandled; /* SCTP-AUTH: Section 3.2 * The CHUNKS parameter MUST be included once in the INIT or * INIT-ACK chunk if the sender wants to receive authenticated * chunks. Its maximum length is 260 bytes. 
*/ if (260 < ntohs(param.p->length)) { sctp_process_inv_paramlength(asoc, param.p, chunk, err_chunk); retval = SCTP_IERROR_ABORT; } break; case SCTP_PARAM_HMAC_ALGO: if (!ep->auth_enable) goto unhandled; hmacs = (struct sctp_hmac_algo_param *)param.p; n_elt = (ntohs(param.p->length) - sizeof(struct sctp_paramhdr)) >> 1; /* SCTP-AUTH: Section 6.1 * The HMAC algorithm based on SHA-1 MUST be supported and * included in the HMAC-ALGO parameter. */ for (i = 0; i < n_elt; i++) { id = ntohs(hmacs->hmac_ids[i]); if (id == SCTP_AUTH_HMAC_ID_SHA1) break; } if (id != SCTP_AUTH_HMAC_ID_SHA1) { sctp_process_inv_paramlength(asoc, param.p, chunk, err_chunk); retval = SCTP_IERROR_ABORT; } break; unhandled: default: pr_debug("%s: unrecognized param:%d for chunk:%d\n", __func__, ntohs(param.p->type), cid); retval = sctp_process_unk_param(asoc, param, chunk, err_chunk); break; } return retval; } /* Verify the INIT packet before we process it. */ int sctp_verify_init(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, enum sctp_cid cid, struct sctp_init_chunk *peer_init, struct sctp_chunk *chunk, struct sctp_chunk **errp) { union sctp_params param; bool has_cookie = false; int result; /* Check for missing mandatory parameters. Note: Initial TSN is * also mandatory, but is not checked here since the valid range * is 0..2**32-1. RFC4960, section 3.3.3. */ if (peer_init->init_hdr.num_outbound_streams == 0 || peer_init->init_hdr.num_inbound_streams == 0 || peer_init->init_hdr.init_tag == 0 || ntohl(peer_init->init_hdr.a_rwnd) < SCTP_DEFAULT_MINWINDOW) return sctp_process_inv_mandatory(asoc, chunk, errp); sctp_walk_params(param, peer_init) { if (param.p->type == SCTP_PARAM_STATE_COOKIE) has_cookie = true; } /* There is a possibility that a parameter length was bad and * in that case we would have stoped walking the parameters. * The current param.p would point at the bad one. * Current consensus on the mailing list is to generate a PROTOCOL * VIOLATION error. We build the ERROR chunk here and let the normal * error handling code build and send the packet. */ if (param.v != (void *)chunk->chunk_end) return sctp_process_inv_paramlength(asoc, param.p, chunk, errp); /* The only missing mandatory param possible today is * the state cookie for an INIT-ACK chunk. */ if ((SCTP_CID_INIT_ACK == cid) && !has_cookie) return sctp_process_missing_param(asoc, SCTP_PARAM_STATE_COOKIE, chunk, errp); /* Verify all the variable length parameters */ sctp_walk_params(param, peer_init) { result = sctp_verify_param(net, ep, asoc, param, cid, chunk, errp); switch (result) { case SCTP_IERROR_ABORT: case SCTP_IERROR_NOMEM: return 0; case SCTP_IERROR_ERROR: return 1; case SCTP_IERROR_NO_ERROR: default: break; } } /* for (loop through all parameters) */ return 1; } /* Unpack the parameters in an INIT packet into an association. * Returns 0 on failure, else success. * FIXME: This is an association method. */ int sctp_process_init(struct sctp_association *asoc, struct sctp_chunk *chunk, const union sctp_addr *peer_addr, struct sctp_init_chunk *peer_init, gfp_t gfp) { struct sctp_transport *transport; struct list_head *pos, *temp; union sctp_params param; union sctp_addr addr; struct sctp_af *af; int src_match = 0; /* We must include the address that the INIT packet came from. * This is the only address that matters for an INIT packet. * When processing a COOKIE ECHO, we retrieve the from address * of the INIT from the cookie. 
*/ /* This implementation defaults to making the first transport * added as the primary transport. The source address seems to * be a better choice than any of the embedded addresses. */ asoc->encap_port = SCTP_INPUT_CB(chunk->skb)->encap_port; if (!sctp_assoc_add_peer(asoc, peer_addr, gfp, SCTP_ACTIVE)) goto nomem; if (sctp_cmp_addr_exact(sctp_source(chunk), peer_addr)) src_match = 1; /* Process the initialization parameters. */ sctp_walk_params(param, peer_init) { if (!src_match && (param.p->type == SCTP_PARAM_IPV4_ADDRESS || param.p->type == SCTP_PARAM_IPV6_ADDRESS)) { af = sctp_get_af_specific(param_type2af(param.p->type)); if (!af->from_addr_param(&addr, param.addr, chunk->sctp_hdr->source, 0)) continue; if (sctp_cmp_addr_exact(sctp_source(chunk), &addr)) src_match = 1; } if (!sctp_process_param(asoc, param, peer_addr, gfp)) goto clean_up; } /* source address of chunk may not match any valid address */ if (!src_match) goto clean_up; /* AUTH: After processing the parameters, make sure that we * have all the required info to potentially do authentications. */ if (asoc->peer.auth_capable && (!asoc->peer.peer_random || !asoc->peer.peer_hmacs)) asoc->peer.auth_capable = 0; /* In a non-backward compatible mode, if the peer claims * support for ADD-IP but not AUTH, the ADD-IP spec states * that we MUST ABORT the association. Section 6. The section * also give us an option to silently ignore the packet, which * is what we'll do here. */ if (!asoc->base.net->sctp.addip_noauth && (asoc->peer.asconf_capable && !asoc->peer.auth_capable)) { asoc->peer.addip_disabled_mask |= (SCTP_PARAM_ADD_IP | SCTP_PARAM_DEL_IP | SCTP_PARAM_SET_PRIMARY); asoc->peer.asconf_capable = 0; goto clean_up; } /* Walk list of transports, removing transports in the UNKNOWN state. */ list_for_each_safe(pos, temp, &asoc->peer.transport_addr_list) { transport = list_entry(pos, struct sctp_transport, transports); if (transport->state == SCTP_UNKNOWN) { sctp_assoc_rm_peer(asoc, transport); } } /* The fixed INIT headers are always in network byte * order. */ asoc->peer.i.init_tag = ntohl(peer_init->init_hdr.init_tag); asoc->peer.i.a_rwnd = ntohl(peer_init->init_hdr.a_rwnd); asoc->peer.i.num_outbound_streams = ntohs(peer_init->init_hdr.num_outbound_streams); asoc->peer.i.num_inbound_streams = ntohs(peer_init->init_hdr.num_inbound_streams); asoc->peer.i.initial_tsn = ntohl(peer_init->init_hdr.initial_tsn); asoc->strreset_inseq = asoc->peer.i.initial_tsn; /* Apply the upper bounds for output streams based on peer's * number of inbound streams. */ if (asoc->c.sinit_num_ostreams > ntohs(peer_init->init_hdr.num_inbound_streams)) { asoc->c.sinit_num_ostreams = ntohs(peer_init->init_hdr.num_inbound_streams); } if (asoc->c.sinit_max_instreams > ntohs(peer_init->init_hdr.num_outbound_streams)) { asoc->c.sinit_max_instreams = ntohs(peer_init->init_hdr.num_outbound_streams); } /* Copy Initiation tag from INIT to VT_peer in cookie. */ asoc->c.peer_vtag = asoc->peer.i.init_tag; /* Peer Rwnd : Current calculated value of the peer's rwnd. */ asoc->peer.rwnd = asoc->peer.i.a_rwnd; /* RFC 2960 7.2.1 The initial value of ssthresh MAY be arbitrarily * high (for example, implementations MAY use the size of the receiver * advertised window). */ list_for_each_entry(transport, &asoc->peer.transport_addr_list, transports) { transport->ssthresh = asoc->peer.i.a_rwnd; } /* Set up the TSN tracking pieces. 
*/ if (!sctp_tsnmap_init(&asoc->peer.tsn_map, SCTP_TSN_MAP_INITIAL, asoc->peer.i.initial_tsn, gfp)) goto clean_up; /* RFC 2960 6.5 Stream Identifier and Stream Sequence Number * * The stream sequence number in all the streams shall start * from 0 when the association is established. Also, when the * stream sequence number reaches the value 65535 the next * stream sequence number shall be set to 0. */ if (sctp_stream_init(&asoc->stream, asoc->c.sinit_num_ostreams, asoc->c.sinit_max_instreams, gfp)) goto clean_up; /* Update frag_point when stream_interleave may get changed. */ sctp_assoc_update_frag_point(asoc); if (!asoc->temp && sctp_assoc_set_id(asoc, gfp)) goto clean_up; /* ADDIP Section 4.1 ASCONF Chunk Procedures * * When an endpoint has an ASCONF signaled change to be sent to the * remote endpoint it should do the following: * ... * A2) A serial number should be assigned to the Chunk. The serial * number should be a monotonically increasing number. All serial * numbers are defined to be initialized at the start of the * association to the same value as the Initial TSN. */ asoc->peer.addip_serial = asoc->peer.i.initial_tsn - 1; return 1; clean_up: /* Release the transport structures. */ list_for_each_safe(pos, temp, &asoc->peer.transport_addr_list) { transport = list_entry(pos, struct sctp_transport, transports); if (transport->state != SCTP_ACTIVE) sctp_assoc_rm_peer(asoc, transport); } nomem: return 0; } /* Update asoc with the option described in param. * * RFC2960 3.3.2.1 Optional/Variable Length Parameters in INIT * * asoc is the association to update. * param is the variable length parameter to use for update. * cid tells us if this is an INIT, INIT ACK or COOKIE ECHO. * If the current packet is an INIT we want to minimize the amount of * work we do. In particular, we should not build transport * structures for the addresses. */ static int sctp_process_param(struct sctp_association *asoc, union sctp_params param, const union sctp_addr *peer_addr, gfp_t gfp) { struct sctp_endpoint *ep = asoc->ep; union sctp_addr_param *addr_param; struct net *net = asoc->base.net; struct sctp_transport *t; enum sctp_scope scope; union sctp_addr addr; struct sctp_af *af; int retval = 1, i; u32 stale; __u16 sat; /* We maintain all INIT parameters in network byte order all the * time. This allows us to not worry about whether the parameters * came from a fresh INIT, and INIT ACK, or were stored in a cookie. */ switch (param.p->type) { case SCTP_PARAM_IPV6_ADDRESS: if (PF_INET6 != asoc->base.sk->sk_family) break; goto do_addr_param; case SCTP_PARAM_IPV4_ADDRESS: /* v4 addresses are not allowed on v6-only socket */ if (ipv6_only_sock(asoc->base.sk)) break; do_addr_param: af = sctp_get_af_specific(param_type2af(param.p->type)); if (!af->from_addr_param(&addr, param.addr, htons(asoc->peer.port), 0)) break; scope = sctp_scope(peer_addr); if (sctp_in_scope(net, &addr, scope)) if (!sctp_assoc_add_peer(asoc, &addr, gfp, SCTP_UNCONFIRMED)) return 0; break; case SCTP_PARAM_COOKIE_PRESERVATIVE: if (!net->sctp.cookie_preserve_enable) break; stale = ntohl(param.life->lifespan_increment); /* Suggested Cookie Life span increment's unit is msec, * (1/1000sec). */ asoc->cookie_life = ktime_add_ms(asoc->cookie_life, stale); break; case SCTP_PARAM_SUPPORTED_ADDRESS_TYPES: /* Turn off the default values first so we'll know which * ones are really set by the peer. */ asoc->peer.ipv4_address = 0; asoc->peer.ipv6_address = 0; /* Assume that peer supports the address family * by which it sends a packet. 
*/ if (peer_addr->sa.sa_family == AF_INET6) asoc->peer.ipv6_address = 1; else if (peer_addr->sa.sa_family == AF_INET) asoc->peer.ipv4_address = 1; /* Cycle through address types; avoid divide by 0. */ sat = ntohs(param.p->length) - sizeof(struct sctp_paramhdr); if (sat) sat /= sizeof(__u16); for (i = 0; i < sat; ++i) { switch (param.sat->types[i]) { case SCTP_PARAM_IPV4_ADDRESS: asoc->peer.ipv4_address = 1; break; case SCTP_PARAM_IPV6_ADDRESS: if (PF_INET6 == asoc->base.sk->sk_family) asoc->peer.ipv6_address = 1; break; default: /* Just ignore anything else. */ break; } } break; case SCTP_PARAM_STATE_COOKIE: asoc->peer.cookie_len = ntohs(param.p->length) - sizeof(struct sctp_paramhdr); kfree(asoc->peer.cookie); asoc->peer.cookie = kmemdup(param.cookie->body, asoc->peer.cookie_len, gfp); if (!asoc->peer.cookie) retval = 0; break; case SCTP_PARAM_HEARTBEAT_INFO: /* Would be odd to receive, but it causes no problems. */ break; case SCTP_PARAM_UNRECOGNIZED_PARAMETERS: /* Rejected during verify stage. */ break; case SCTP_PARAM_ECN_CAPABLE: if (asoc->ep->ecn_enable) { asoc->peer.ecn_capable = 1; break; } /* Fall Through */ goto fall_through; case SCTP_PARAM_ADAPTATION_LAYER_IND: asoc->peer.adaptation_ind = ntohl(param.aind->adaptation_ind); break; case SCTP_PARAM_SET_PRIMARY: if (!ep->asconf_enable) goto fall_through; addr_param = param.v + sizeof(struct sctp_addip_param); af = sctp_get_af_specific(param_type2af(addr_param->p.type)); if (!af) break; if (!af->from_addr_param(&addr, addr_param, htons(asoc->peer.port), 0)) break; if (!af->addr_valid(&addr, NULL, NULL)) break; t = sctp_assoc_lookup_paddr(asoc, &addr); if (!t) break; sctp_assoc_set_primary(asoc, t); break; case SCTP_PARAM_SUPPORTED_EXT: sctp_process_ext_param(asoc, param); break; case SCTP_PARAM_FWD_TSN_SUPPORT: if (asoc->ep->prsctp_enable) { asoc->peer.prsctp_capable = 1; break; } /* Fall Through */ goto fall_through; case SCTP_PARAM_RANDOM: if (!ep->auth_enable) goto fall_through; /* Save peer's random parameter */ kfree(asoc->peer.peer_random); asoc->peer.peer_random = kmemdup(param.p, ntohs(param.p->length), gfp); if (!asoc->peer.peer_random) { retval = 0; break; } break; case SCTP_PARAM_HMAC_ALGO: if (!ep->auth_enable) goto fall_through; /* Save peer's HMAC list */ kfree(asoc->peer.peer_hmacs); asoc->peer.peer_hmacs = kmemdup(param.p, ntohs(param.p->length), gfp); if (!asoc->peer.peer_hmacs) { retval = 0; break; } /* Set the default HMAC the peer requested*/ sctp_auth_asoc_set_default_hmac(asoc, param.hmac_algo); break; case SCTP_PARAM_CHUNKS: if (!ep->auth_enable) goto fall_through; kfree(asoc->peer.peer_chunks); asoc->peer.peer_chunks = kmemdup(param.p, ntohs(param.p->length), gfp); if (!asoc->peer.peer_chunks) retval = 0; break; fall_through: default: /* Any unrecognized parameters should have been caught * and handled by sctp_verify_param() which should be * called prior to this routine. Simply log the error * here. */ pr_debug("%s: ignoring param:%d for association:%p.\n", __func__, ntohs(param.p->type), asoc); break; } return retval; } /* Select a new verification tag. */ __u32 sctp_generate_tag(const struct sctp_endpoint *ep) { /* I believe that this random number generator complies with RFC1750. * A tag of 0 is reserved for special cases (e.g. INIT). */ __u32 x; do { get_random_bytes(&x, sizeof(__u32)); } while (x == 0); return x; } /* Select an initial TSN to send during startup. 
*/ __u32 sctp_generate_tsn(const struct sctp_endpoint *ep) { __u32 retval; get_random_bytes(&retval, sizeof(__u32)); return retval; } /* * ADDIP 3.1.1 Address Configuration Change Chunk (ASCONF) * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Type = 0xC1 | Chunk Flags | Chunk Length | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Serial Number | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Address Parameter | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | ASCONF Parameter #1 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * \ \ * / .... / * \ \ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | ASCONF Parameter #N | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * Address Parameter and other parameter will not be wrapped in this function */ static struct sctp_chunk *sctp_make_asconf(struct sctp_association *asoc, union sctp_addr *addr, int vparam_len) { struct sctp_addiphdr asconf; struct sctp_chunk *retval; int length = sizeof(asconf) + vparam_len; union sctp_addr_param addrparam; int addrlen; struct sctp_af *af = sctp_get_af_specific(addr->v4.sin_family); addrlen = af->to_addr_param(addr, &addrparam); if (!addrlen) return NULL; length += addrlen; /* Create the chunk. */ retval = sctp_make_control(asoc, SCTP_CID_ASCONF, 0, length, GFP_ATOMIC); if (!retval) return NULL; asconf.serial = htonl(asoc->addip_serial++); retval->subh.addip_hdr = sctp_addto_chunk(retval, sizeof(asconf), &asconf); retval->param_hdr.v = sctp_addto_chunk(retval, addrlen, &addrparam); return retval; } /* ADDIP * 3.2.1 Add IP Address * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Type = 0xC001 | Length = Variable | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | ASCONF-Request Correlation ID | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Address Parameter | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * 3.2.2 Delete IP Address * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Type = 0xC002 | Length = Variable | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | ASCONF-Request Correlation ID | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Address Parameter | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * */ struct sctp_chunk *sctp_make_asconf_update_ip(struct sctp_association *asoc, union sctp_addr *laddr, struct sockaddr *addrs, int addrcnt, __be16 flags) { union sctp_addr_param addr_param; struct sctp_addip_param param; int paramlen = sizeof(param); struct sctp_chunk *retval; int addr_param_len = 0; union sctp_addr *addr; int totallen = 0, i; int del_pickup = 0; struct sctp_af *af; void *addr_buf; /* Get total length of all the address parameters. 
*/ addr_buf = addrs; for (i = 0; i < addrcnt; i++) { addr = addr_buf; af = sctp_get_af_specific(addr->v4.sin_family); addr_param_len = af->to_addr_param(addr, &addr_param); totallen += paramlen; totallen += addr_param_len; addr_buf += af->sockaddr_len; if (asoc->asconf_addr_del_pending && !del_pickup) { /* reuse the parameter length from the same scope one */ totallen += paramlen; totallen += addr_param_len; del_pickup = 1; pr_debug("%s: picked same-scope del_pending addr, " "totallen for all addresses is %d\n", __func__, totallen); } } /* Create an asconf chunk with the required length. */ retval = sctp_make_asconf(asoc, laddr, totallen); if (!retval) return NULL; /* Add the address parameters to the asconf chunk. */ addr_buf = addrs; for (i = 0; i < addrcnt; i++) { addr = addr_buf; af = sctp_get_af_specific(addr->v4.sin_family); addr_param_len = af->to_addr_param(addr, &addr_param); param.param_hdr.type = flags; param.param_hdr.length = htons(paramlen + addr_param_len); param.crr_id = htonl(i); sctp_addto_chunk(retval, paramlen, &param); sctp_addto_chunk(retval, addr_param_len, &addr_param); addr_buf += af->sockaddr_len; } if (flags == SCTP_PARAM_ADD_IP && del_pickup) { addr = asoc->asconf_addr_del_pending; af = sctp_get_af_specific(addr->v4.sin_family); addr_param_len = af->to_addr_param(addr, &addr_param); param.param_hdr.type = SCTP_PARAM_DEL_IP; param.param_hdr.length = htons(paramlen + addr_param_len); param.crr_id = htonl(i); sctp_addto_chunk(retval, paramlen, &param); sctp_addto_chunk(retval, addr_param_len, &addr_param); } return retval; } /* ADDIP * 3.2.4 Set Primary IP Address * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Type =0xC004 | Length = Variable | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | ASCONF-Request Correlation ID | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Address Parameter | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * Create an ASCONF chunk with Set Primary IP address parameter. */ struct sctp_chunk *sctp_make_asconf_set_prim(struct sctp_association *asoc, union sctp_addr *addr) { struct sctp_af *af = sctp_get_af_specific(addr->v4.sin_family); union sctp_addr_param addrparam; struct sctp_addip_param param; struct sctp_chunk *retval; int len = sizeof(param); int addrlen; addrlen = af->to_addr_param(addr, &addrparam); if (!addrlen) return NULL; len += addrlen; /* Create the chunk and make asconf header. */ retval = sctp_make_asconf(asoc, addr, len); if (!retval) return NULL; param.param_hdr.type = SCTP_PARAM_SET_PRIMARY; param.param_hdr.length = htons(len); param.crr_id = 0; sctp_addto_chunk(retval, sizeof(param), &param); sctp_addto_chunk(retval, addrlen, &addrparam); return retval; } /* ADDIP 3.1.2 Address Configuration Acknowledgement Chunk (ASCONF-ACK) * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Type = 0x80 | Chunk Flags | Chunk Length | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Serial Number | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | ASCONF Parameter Response#1 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * \ \ * / .... 
/ * \ \ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | ASCONF Parameter Response#N | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * Create an ASCONF_ACK chunk with enough space for the parameter responses. */ static struct sctp_chunk *sctp_make_asconf_ack(const struct sctp_association *asoc, __u32 serial, int vparam_len) { struct sctp_addiphdr asconf; struct sctp_chunk *retval; int length = sizeof(asconf) + vparam_len; /* Create the chunk. */ retval = sctp_make_control(asoc, SCTP_CID_ASCONF_ACK, 0, length, GFP_ATOMIC); if (!retval) return NULL; asconf.serial = htonl(serial); retval->subh.addip_hdr = sctp_addto_chunk(retval, sizeof(asconf), &asconf); return retval; } /* Add response parameters to an ASCONF_ACK chunk. */ static void sctp_add_asconf_response(struct sctp_chunk *chunk, __be32 crr_id, __be16 err_code, struct sctp_addip_param *asconf_param) { struct sctp_addip_param ack_param; struct sctp_errhdr err_param; int asconf_param_len = 0; int err_param_len = 0; __be16 response_type; if (SCTP_ERROR_NO_ERROR == err_code) { response_type = SCTP_PARAM_SUCCESS_REPORT; } else { response_type = SCTP_PARAM_ERR_CAUSE; err_param_len = sizeof(err_param); if (asconf_param) asconf_param_len = ntohs(asconf_param->param_hdr.length); } /* Add Success Indication or Error Cause Indication parameter. */ ack_param.param_hdr.type = response_type; ack_param.param_hdr.length = htons(sizeof(ack_param) + err_param_len + asconf_param_len); ack_param.crr_id = crr_id; sctp_addto_chunk(chunk, sizeof(ack_param), &ack_param); if (SCTP_ERROR_NO_ERROR == err_code) return; /* Add Error Cause parameter. */ err_param.cause = err_code; err_param.length = htons(err_param_len + asconf_param_len); sctp_addto_chunk(chunk, err_param_len, &err_param); /* Add the failed TLV copied from ASCONF chunk. */ if (asconf_param) sctp_addto_chunk(chunk, asconf_param_len, asconf_param); } /* Process a asconf parameter. */ static __be16 sctp_process_asconf_param(struct sctp_association *asoc, struct sctp_chunk *asconf, struct sctp_addip_param *asconf_param) { union sctp_addr_param *addr_param; struct sctp_transport *peer; union sctp_addr addr; struct sctp_af *af; addr_param = (void *)asconf_param + sizeof(*asconf_param); if (asconf_param->param_hdr.type != SCTP_PARAM_ADD_IP && asconf_param->param_hdr.type != SCTP_PARAM_DEL_IP && asconf_param->param_hdr.type != SCTP_PARAM_SET_PRIMARY) return SCTP_ERROR_UNKNOWN_PARAM; switch (addr_param->p.type) { case SCTP_PARAM_IPV6_ADDRESS: if (!asoc->peer.ipv6_address) return SCTP_ERROR_DNS_FAILED; break; case SCTP_PARAM_IPV4_ADDRESS: if (!asoc->peer.ipv4_address) return SCTP_ERROR_DNS_FAILED; break; default: return SCTP_ERROR_DNS_FAILED; } af = sctp_get_af_specific(param_type2af(addr_param->p.type)); if (unlikely(!af)) return SCTP_ERROR_DNS_FAILED; if (!af->from_addr_param(&addr, addr_param, htons(asoc->peer.port), 0)) return SCTP_ERROR_DNS_FAILED; /* ADDIP 4.2.1 This parameter MUST NOT contain a broadcast * or multicast address. * (note: wildcard is permitted and requires special handling so * make sure we check for that) */ if (!af->is_any(&addr) && !af->addr_valid(&addr, NULL, asconf->skb)) return SCTP_ERROR_DNS_FAILED; switch (asconf_param->param_hdr.type) { case SCTP_PARAM_ADD_IP: /* Section 4.2.1: * If the address 0.0.0.0 or ::0 is provided, the source * address of the packet MUST be added. 
*/ if (af->is_any(&addr)) memcpy(&addr, &asconf->source, sizeof(addr)); if (security_sctp_bind_connect(asoc->ep->base.sk, SCTP_PARAM_ADD_IP, (struct sockaddr *)&addr, af->sockaddr_len)) return SCTP_ERROR_REQ_REFUSED; /* ADDIP 4.3 D9) If an endpoint receives an ADD IP address * request and does not have the local resources to add this * new address to the association, it MUST return an Error * Cause TLV set to the new error code 'Operation Refused * Due to Resource Shortage'. */ peer = sctp_assoc_add_peer(asoc, &addr, GFP_ATOMIC, SCTP_UNCONFIRMED); if (!peer) return SCTP_ERROR_RSRC_LOW; /* Start the heartbeat timer. */ sctp_transport_reset_hb_timer(peer); asoc->new_transport = peer; break; case SCTP_PARAM_DEL_IP: /* ADDIP 4.3 D7) If a request is received to delete the * last remaining IP address of a peer endpoint, the receiver * MUST send an Error Cause TLV with the error cause set to the * new error code 'Request to Delete Last Remaining IP Address'. */ if (asoc->peer.transport_count == 1) return SCTP_ERROR_DEL_LAST_IP; /* ADDIP 4.3 D8) If a request is received to delete an IP * address which is also the source address of the IP packet * which contained the ASCONF chunk, the receiver MUST reject * this request. To reject the request the receiver MUST send * an Error Cause TLV set to the new error code 'Request to * Delete Source IP Address' */ if (sctp_cmp_addr_exact(&asconf->source, &addr)) return SCTP_ERROR_DEL_SRC_IP; /* Section 4.2.2 * If the address 0.0.0.0 or ::0 is provided, all * addresses of the peer except the source address of the * packet MUST be deleted. */ if (af->is_any(&addr)) { sctp_assoc_set_primary(asoc, asconf->transport); sctp_assoc_del_nonprimary_peers(asoc, asconf->transport); return SCTP_ERROR_NO_ERROR; } /* If the address is not part of the association, the * ASCONF-ACK with Error Cause Indication Parameter * which including cause of Unresolvable Address should * be sent. */ peer = sctp_assoc_lookup_paddr(asoc, &addr); if (!peer) return SCTP_ERROR_DNS_FAILED; sctp_assoc_rm_peer(asoc, peer); break; case SCTP_PARAM_SET_PRIMARY: /* ADDIP Section 4.2.4 * If the address 0.0.0.0 or ::0 is provided, the receiver * MAY mark the source address of the packet as its * primary. */ if (af->is_any(&addr)) memcpy(&addr, sctp_source(asconf), sizeof(addr)); if (security_sctp_bind_connect(asoc->ep->base.sk, SCTP_PARAM_SET_PRIMARY, (struct sockaddr *)&addr, af->sockaddr_len)) return SCTP_ERROR_REQ_REFUSED; peer = sctp_assoc_lookup_paddr(asoc, &addr); if (!peer) return SCTP_ERROR_DNS_FAILED; sctp_assoc_set_primary(asoc, peer); break; } return SCTP_ERROR_NO_ERROR; } /* Verify the ASCONF packet before we process it. */ bool sctp_verify_asconf(const struct sctp_association *asoc, struct sctp_chunk *chunk, bool addr_param_needed, struct sctp_paramhdr **errp) { struct sctp_addip_chunk *addip; bool addr_param_seen = false; union sctp_params param; addip = (struct sctp_addip_chunk *)chunk->chunk_hdr; sctp_walk_params(param, addip) { size_t length = ntohs(param.p->length); *errp = param.p; switch (param.p->type) { case SCTP_PARAM_ERR_CAUSE: break; case SCTP_PARAM_IPV4_ADDRESS: if (length != sizeof(struct sctp_ipv4addr_param)) return false; /* ensure there is only one addr param and it's in the * beginning of addip_hdr params, or we reject it. 
*/ if (param.v != (addip + 1)) return false; addr_param_seen = true; break; case SCTP_PARAM_IPV6_ADDRESS: if (length != sizeof(struct sctp_ipv6addr_param)) return false; if (param.v != (addip + 1)) return false; addr_param_seen = true; break; case SCTP_PARAM_ADD_IP: case SCTP_PARAM_DEL_IP: case SCTP_PARAM_SET_PRIMARY: /* In ASCONF chunks, these need to be first. */ if (addr_param_needed && !addr_param_seen) return false; length = ntohs(param.addip->param_hdr.length); if (length < sizeof(struct sctp_addip_param) + sizeof(**errp)) return false; break; case SCTP_PARAM_SUCCESS_REPORT: case SCTP_PARAM_ADAPTATION_LAYER_IND: if (length != sizeof(struct sctp_addip_param)) return false; break; default: /* This is unknown to us, reject! */ return false; } } /* Remaining sanity checks. */ if (addr_param_needed && !addr_param_seen) return false; if (!addr_param_needed && addr_param_seen) return false; if (param.v != chunk->chunk_end) return false; return true; } /* Process an incoming ASCONF chunk with the next expected serial no. and * return an ASCONF_ACK chunk to be sent in response. */ struct sctp_chunk *sctp_process_asconf(struct sctp_association *asoc, struct sctp_chunk *asconf) { union sctp_addr_param *addr_param; struct sctp_addip_chunk *addip; struct sctp_chunk *asconf_ack; bool all_param_pass = true; struct sctp_addiphdr *hdr; int length = 0, chunk_len; union sctp_params param; __be16 err_code; __u32 serial; addip = (struct sctp_addip_chunk *)asconf->chunk_hdr; chunk_len = ntohs(asconf->chunk_hdr->length) - sizeof(struct sctp_chunkhdr); hdr = (struct sctp_addiphdr *)asconf->skb->data; serial = ntohl(hdr->serial); /* Skip the addiphdr and store a pointer to address parameter. */ length = sizeof(*hdr); addr_param = (union sctp_addr_param *)(asconf->skb->data + length); chunk_len -= length; /* Skip the address parameter and store a pointer to the first * asconf parameter. */ length = ntohs(addr_param->p.length); chunk_len -= length; /* create an ASCONF_ACK chunk. * Based on the definitions of parameters, we know that the size of * ASCONF_ACK parameters are less than or equal to the fourfold of ASCONF * parameters. */ asconf_ack = sctp_make_asconf_ack(asoc, serial, chunk_len * 4); if (!asconf_ack) goto done; /* Process the TLVs contained within the ASCONF chunk. */ sctp_walk_params(param, addip) { /* Skip preceding address parameters. */ if (param.p->type == SCTP_PARAM_IPV4_ADDRESS || param.p->type == SCTP_PARAM_IPV6_ADDRESS) continue; err_code = sctp_process_asconf_param(asoc, asconf, param.addip); /* ADDIP 4.1 A7) * If an error response is received for a TLV parameter, * all TLVs with no response before the failed TLV are * considered successful if not reported. All TLVs after * the failed response are considered unsuccessful unless * a specific success indication is present for the parameter. */ if (err_code != SCTP_ERROR_NO_ERROR) all_param_pass = false; if (!all_param_pass) sctp_add_asconf_response(asconf_ack, param.addip->crr_id, err_code, param.addip); /* ADDIP 4.3 D11) When an endpoint receiving an ASCONF to add * an IP address sends an 'Out of Resource' in its response, it * MUST also fail any subsequent add or delete requests bundled * in the ASCONF. */ if (err_code == SCTP_ERROR_RSRC_LOW) goto done; } done: asoc->peer.addip_serial++; /* If we are sending a new ASCONF_ACK hold a reference to it in assoc * after freeing the reference to old asconf ack if any. 
*/ if (asconf_ack) { sctp_chunk_hold(asconf_ack); list_add_tail(&asconf_ack->transmitted_list, &asoc->asconf_ack_list); } return asconf_ack; } /* Process a asconf parameter that is successfully acked. */ static void sctp_asconf_param_success(struct sctp_association *asoc, struct sctp_addip_param *asconf_param) { struct sctp_bind_addr *bp = &asoc->base.bind_addr; union sctp_addr_param *addr_param; struct sctp_sockaddr_entry *saddr; struct sctp_transport *transport; union sctp_addr addr; struct sctp_af *af; addr_param = (void *)asconf_param + sizeof(*asconf_param); /* We have checked the packet before, so we do not check again. */ af = sctp_get_af_specific(param_type2af(addr_param->p.type)); if (!af->from_addr_param(&addr, addr_param, htons(bp->port), 0)) return; switch (asconf_param->param_hdr.type) { case SCTP_PARAM_ADD_IP: /* This is always done in BH context with a socket lock * held, so the list can not change. */ local_bh_disable(); list_for_each_entry(saddr, &bp->address_list, list) { if (sctp_cmp_addr_exact(&saddr->a, &addr)) saddr->state = SCTP_ADDR_SRC; } local_bh_enable(); list_for_each_entry(transport, &asoc->peer.transport_addr_list, transports) { sctp_transport_dst_release(transport); } break; case SCTP_PARAM_DEL_IP: local_bh_disable(); sctp_del_bind_addr(bp, &addr); if (asoc->asconf_addr_del_pending != NULL && sctp_cmp_addr_exact(asoc->asconf_addr_del_pending, &addr)) { kfree(asoc->asconf_addr_del_pending); asoc->asconf_addr_del_pending = NULL; } local_bh_enable(); list_for_each_entry(transport, &asoc->peer.transport_addr_list, transports) { sctp_transport_dst_release(transport); } break; default: break; } } /* Get the corresponding ASCONF response error code from the ASCONF_ACK chunk * for the given asconf parameter. If there is no response for this parameter, * return the error code based on the third argument 'no_err'. * ADDIP 4.1 * A7) If an error response is received for a TLV parameter, all TLVs with no * response before the failed TLV are considered successful if not reported. * All TLVs after the failed response are considered unsuccessful unless a * specific success indication is present for the parameter. */ static __be16 sctp_get_asconf_response(struct sctp_chunk *asconf_ack, struct sctp_addip_param *asconf_param, int no_err) { struct sctp_addip_param *asconf_ack_param; struct sctp_errhdr *err_param; int asconf_ack_len; __be16 err_code; int length; if (no_err) err_code = SCTP_ERROR_NO_ERROR; else err_code = SCTP_ERROR_REQ_REFUSED; asconf_ack_len = ntohs(asconf_ack->chunk_hdr->length) - sizeof(struct sctp_chunkhdr); /* Skip the addiphdr from the asconf_ack chunk and store a pointer to * the first asconf_ack parameter. */ length = sizeof(struct sctp_addiphdr); asconf_ack_param = (struct sctp_addip_param *)(asconf_ack->skb->data + length); asconf_ack_len -= length; while (asconf_ack_len > 0) { if (asconf_ack_param->crr_id == asconf_param->crr_id) { switch (asconf_ack_param->param_hdr.type) { case SCTP_PARAM_SUCCESS_REPORT: return SCTP_ERROR_NO_ERROR; case SCTP_PARAM_ERR_CAUSE: length = sizeof(*asconf_ack_param); err_param = (void *)asconf_ack_param + length; asconf_ack_len -= length; if (asconf_ack_len > 0) return err_param->cause; else return SCTP_ERROR_INV_PARAM; break; default: return SCTP_ERROR_INV_PARAM; } } length = ntohs(asconf_ack_param->param_hdr.length); asconf_ack_param = (void *)asconf_ack_param + length; asconf_ack_len -= length; } return err_code; } /* Process an incoming ASCONF_ACK chunk against the cached last ASCONF chunk. 
*/ int sctp_process_asconf_ack(struct sctp_association *asoc, struct sctp_chunk *asconf_ack) { struct sctp_chunk *asconf = asoc->addip_last_asconf; struct sctp_addip_param *asconf_param; __be16 err_code = SCTP_ERROR_NO_ERROR; union sctp_addr_param *addr_param; int asconf_len = asconf->skb->len; int all_param_pass = 0; int length = 0; int no_err = 1; int retval = 0; /* Skip the chunkhdr and addiphdr from the last asconf sent and store * a pointer to address parameter. */ length = sizeof(struct sctp_addip_chunk); addr_param = (union sctp_addr_param *)(asconf->skb->data + length); asconf_len -= length; /* Skip the address parameter in the last asconf sent and store a * pointer to the first asconf parameter. */ length = ntohs(addr_param->p.length); asconf_param = (void *)addr_param + length; asconf_len -= length; /* ADDIP 4.1 * A8) If there is no response(s) to specific TLV parameter(s), and no * failures are indicated, then all request(s) are considered * successful. */ if (asconf_ack->skb->len == sizeof(struct sctp_addiphdr)) all_param_pass = 1; /* Process the TLVs contained in the last sent ASCONF chunk. */ while (asconf_len > 0) { if (all_param_pass) err_code = SCTP_ERROR_NO_ERROR; else { err_code = sctp_get_asconf_response(asconf_ack, asconf_param, no_err); if (no_err && (SCTP_ERROR_NO_ERROR != err_code)) no_err = 0; } switch (err_code) { case SCTP_ERROR_NO_ERROR: sctp_asconf_param_success(asoc, asconf_param); break; case SCTP_ERROR_RSRC_LOW: retval = 1; break; case SCTP_ERROR_UNKNOWN_PARAM: /* Disable sending this type of asconf parameter in * future. */ asoc->peer.addip_disabled_mask |= asconf_param->param_hdr.type; break; case SCTP_ERROR_REQ_REFUSED: case SCTP_ERROR_DEL_LAST_IP: case SCTP_ERROR_DEL_SRC_IP: default: break; } /* Skip the processed asconf parameter and move to the next * one. */ length = ntohs(asconf_param->param_hdr.length); asconf_param = (void *)asconf_param + length; asconf_len -= length; } if (no_err && asoc->src_out_of_asoc_ok) { asoc->src_out_of_asoc_ok = 0; sctp_transport_immediate_rtx(asoc->peer.primary_path); } /* Free the cached last sent asconf chunk. */ list_del_init(&asconf->transmitted_list); sctp_chunk_free(asconf); asoc->addip_last_asconf = NULL; return retval; } /* Make a FWD TSN chunk. 
*/ struct sctp_chunk *sctp_make_fwdtsn(const struct sctp_association *asoc, __u32 new_cum_tsn, size_t nstreams, struct sctp_fwdtsn_skip *skiplist) { struct sctp_chunk *retval = NULL; struct sctp_fwdtsn_hdr ftsn_hdr; struct sctp_fwdtsn_skip skip; size_t hint; int i; hint = (nstreams + 1) * sizeof(__u32); retval = sctp_make_control(asoc, SCTP_CID_FWD_TSN, 0, hint, GFP_ATOMIC); if (!retval) return NULL; ftsn_hdr.new_cum_tsn = htonl(new_cum_tsn); retval->subh.fwdtsn_hdr = sctp_addto_chunk(retval, sizeof(ftsn_hdr), &ftsn_hdr); for (i = 0; i < nstreams; i++) { skip.stream = skiplist[i].stream; skip.ssn = skiplist[i].ssn; sctp_addto_chunk(retval, sizeof(skip), &skip); } return retval; } struct sctp_chunk *sctp_make_ifwdtsn(const struct sctp_association *asoc, __u32 new_cum_tsn, size_t nstreams, struct sctp_ifwdtsn_skip *skiplist) { struct sctp_chunk *retval = NULL; struct sctp_ifwdtsn_hdr ftsn_hdr; size_t hint; hint = (nstreams + 1) * sizeof(__u32); retval = sctp_make_control(asoc, SCTP_CID_I_FWD_TSN, 0, hint, GFP_ATOMIC); if (!retval) return NULL; ftsn_hdr.new_cum_tsn = htonl(new_cum_tsn); retval->subh.ifwdtsn_hdr = sctp_addto_chunk(retval, sizeof(ftsn_hdr), &ftsn_hdr); sctp_addto_chunk(retval, nstreams * sizeof(skiplist[0]), skiplist); return retval; } /* RE-CONFIG 3.1 (RE-CONFIG chunk) * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Type = 130 | Chunk Flags | Chunk Length | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * \ \ * / Re-configuration Parameter / * \ \ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * \ \ * / Re-configuration Parameter (optional) / * \ \ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ */ static struct sctp_chunk *sctp_make_reconf(const struct sctp_association *asoc, int length) { struct sctp_reconf_chunk *reconf; struct sctp_chunk *retval; retval = sctp_make_control(asoc, SCTP_CID_RECONF, 0, length, GFP_ATOMIC); if (!retval) return NULL; reconf = (struct sctp_reconf_chunk *)retval->chunk_hdr; retval->param_hdr.v = (u8 *)(reconf + 1); return retval; } /* RE-CONFIG 4.1 (STREAM OUT RESET) * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Parameter Type = 13 | Parameter Length = 16 + 2 * N | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Re-configuration Request Sequence Number | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Re-configuration Response Sequence Number | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Sender's Last Assigned TSN | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Stream Number 1 (optional) | Stream Number 2 (optional) | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * / ...... 
/ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Stream Number N-1 (optional) | Stream Number N (optional) | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * RE-CONFIG 4.2 (STREAM IN RESET) * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Parameter Type = 14 | Parameter Length = 8 + 2 * N | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Re-configuration Request Sequence Number | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Stream Number 1 (optional) | Stream Number 2 (optional) | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * / ...... / * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Stream Number N-1 (optional) | Stream Number N (optional) | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ */ struct sctp_chunk *sctp_make_strreset_req( const struct sctp_association *asoc, __u16 stream_num, __be16 *stream_list, bool out, bool in) { __u16 stream_len = stream_num * sizeof(__u16); struct sctp_strreset_outreq outreq; struct sctp_strreset_inreq inreq; struct sctp_chunk *retval; __u16 outlen, inlen; outlen = (sizeof(outreq) + stream_len) * out; inlen = (sizeof(inreq) + stream_len) * in; retval = sctp_make_reconf(asoc, SCTP_PAD4(outlen) + SCTP_PAD4(inlen)); if (!retval) return NULL; if (outlen) { outreq.param_hdr.type = SCTP_PARAM_RESET_OUT_REQUEST; outreq.param_hdr.length = htons(outlen); outreq.request_seq = htonl(asoc->strreset_outseq); outreq.response_seq = htonl(asoc->strreset_inseq - 1); outreq.send_reset_at_tsn = htonl(asoc->next_tsn - 1); sctp_addto_chunk(retval, sizeof(outreq), &outreq); if (stream_len) sctp_addto_chunk(retval, stream_len, stream_list); } if (inlen) { inreq.param_hdr.type = SCTP_PARAM_RESET_IN_REQUEST; inreq.param_hdr.length = htons(inlen); inreq.request_seq = htonl(asoc->strreset_outseq + out); sctp_addto_chunk(retval, sizeof(inreq), &inreq); if (stream_len) sctp_addto_chunk(retval, stream_len, stream_list); } return retval; } /* RE-CONFIG 4.3 (SSN/TSN RESET ALL) * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Parameter Type = 15 | Parameter Length = 8 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Re-configuration Request Sequence Number | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ */ struct sctp_chunk *sctp_make_strreset_tsnreq( const struct sctp_association *asoc) { struct sctp_strreset_tsnreq tsnreq; __u16 length = sizeof(tsnreq); struct sctp_chunk *retval; retval = sctp_make_reconf(asoc, length); if (!retval) return NULL; tsnreq.param_hdr.type = SCTP_PARAM_RESET_TSN_REQUEST; tsnreq.param_hdr.length = htons(length); tsnreq.request_seq = htonl(asoc->strreset_outseq); sctp_addto_chunk(retval, sizeof(tsnreq), &tsnreq); return retval; } /* RE-CONFIG 4.5/4.6 (ADD STREAM) * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Parameter Type = 17 | Parameter Length = 12 | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Re-configuration Request Sequence Number | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Number of new streams | Reserved | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ */ struct 
sctp_chunk *sctp_make_strreset_addstrm( const struct sctp_association *asoc, __u16 out, __u16 in) { struct sctp_strreset_addstrm addstrm; __u16 size = sizeof(addstrm); struct sctp_chunk *retval; retval = sctp_make_reconf(asoc, (!!out + !!in) * size); if (!retval) return NULL; if (out) { addstrm.param_hdr.type = SCTP_PARAM_RESET_ADD_OUT_STREAMS; addstrm.param_hdr.length = htons(size); addstrm.number_of_streams = htons(out); addstrm.request_seq = htonl(asoc->strreset_outseq); addstrm.reserved = 0; sctp_addto_chunk(retval, size, &addstrm); } if (in) { addstrm.param_hdr.type = SCTP_PARAM_RESET_ADD_IN_STREAMS; addstrm.param_hdr.length = htons(size); addstrm.number_of_streams = htons(in); addstrm.request_seq = htonl(asoc->strreset_outseq + !!out); addstrm.reserved = 0; sctp_addto_chunk(retval, size, &addstrm); } return retval; } /* RE-CONFIG 4.4 (RESP) * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Parameter Type = 16 | Parameter Length | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Re-configuration Response Sequence Number | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Result | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ */ struct sctp_chunk *sctp_make_strreset_resp(const struct sctp_association *asoc, __u32 result, __u32 sn) { struct sctp_strreset_resp resp; __u16 length = sizeof(resp); struct sctp_chunk *retval; retval = sctp_make_reconf(asoc, length); if (!retval) return NULL; resp.param_hdr.type = SCTP_PARAM_RESET_RESPONSE; resp.param_hdr.length = htons(length); resp.response_seq = htonl(sn); resp.result = htonl(result); sctp_addto_chunk(retval, sizeof(resp), &resp); return retval; } /* RE-CONFIG 4.4 OPTIONAL (TSNRESP) * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Parameter Type = 16 | Parameter Length | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Re-configuration Response Sequence Number | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Result | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Sender's Next TSN (optional) | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Receiver's Next TSN (optional) | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ */ struct sctp_chunk *sctp_make_strreset_tsnresp(struct sctp_association *asoc, __u32 result, __u32 sn, __u32 sender_tsn, __u32 receiver_tsn) { struct sctp_strreset_resptsn tsnresp; __u16 length = sizeof(tsnresp); struct sctp_chunk *retval; retval = sctp_make_reconf(asoc, length); if (!retval) return NULL; tsnresp.param_hdr.type = SCTP_PARAM_RESET_RESPONSE; tsnresp.param_hdr.length = htons(length); tsnresp.response_seq = htonl(sn); tsnresp.result = htonl(result); tsnresp.senders_next_tsn = htonl(sender_tsn); tsnresp.receivers_next_tsn = htonl(receiver_tsn); sctp_addto_chunk(retval, sizeof(tsnresp), &tsnresp); return retval; } bool sctp_verify_reconf(const struct sctp_association *asoc, struct sctp_chunk *chunk, struct sctp_paramhdr **errp) { struct sctp_reconf_chunk *hdr; union sctp_params param; __be16 last = 0; __u16 cnt = 0; hdr = (struct sctp_reconf_chunk *)chunk->chunk_hdr; sctp_walk_params(param, hdr) { __u16 length = ntohs(param.p->length); *errp = param.p; if (cnt++ > 2) return false; switch (param.p->type) { case SCTP_PARAM_RESET_OUT_REQUEST: 
if (length < sizeof(struct sctp_strreset_outreq) || (last && last != SCTP_PARAM_RESET_RESPONSE && last != SCTP_PARAM_RESET_IN_REQUEST)) return false; break; case SCTP_PARAM_RESET_IN_REQUEST: if (length < sizeof(struct sctp_strreset_inreq) || (last && last != SCTP_PARAM_RESET_OUT_REQUEST)) return false; break; case SCTP_PARAM_RESET_RESPONSE: if ((length != sizeof(struct sctp_strreset_resp) && length != sizeof(struct sctp_strreset_resptsn)) || (last && last != SCTP_PARAM_RESET_RESPONSE && last != SCTP_PARAM_RESET_OUT_REQUEST)) return false; break; case SCTP_PARAM_RESET_TSN_REQUEST: if (length != sizeof(struct sctp_strreset_tsnreq) || last) return false; break; case SCTP_PARAM_RESET_ADD_IN_STREAMS: if (length != sizeof(struct sctp_strreset_addstrm) || (last && last != SCTP_PARAM_RESET_ADD_OUT_STREAMS)) return false; break; case SCTP_PARAM_RESET_ADD_OUT_STREAMS: if (length != sizeof(struct sctp_strreset_addstrm) || (last && last != SCTP_PARAM_RESET_ADD_IN_STREAMS)) return false; break; default: return false; } last = param.p->type; } return true; }
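The ordering rules that sctp_verify_reconf() enforces above are easier to see in isolation. The sketch below is illustrative only and is not part of the kernel sources in this listing: it is a standalone C program, with invented names for the parameter kinds, that restates the "which RECONF parameter may follow which" relation encoded in the switch statement. It deliberately ignores the length checks and the parameter-count cap that the real function also applies.

#include <stdbool.h>
#include <stdio.h>

/* Invented enum mirroring the six RECONF parameter types checked above. */
enum reconf_kind {
	NONE,		/* no previous parameter in the chunk */
	OUT_REQ,	/* Outgoing SSN Reset Request */
	IN_REQ,		/* Incoming SSN Reset Request */
	TSN_REQ,	/* SSN/TSN Reset Request */
	RESPONSE,	/* Re-configuration Response */
	ADD_OUT,	/* Add Outgoing Streams Request */
	ADD_IN,		/* Add Incoming Streams Request */
};

/* May parameter 'cur' appear when 'prev' was the previous parameter?
 * This mirrors the (last && last != ...) tests in sctp_verify_reconf(). */
static bool may_follow(enum reconf_kind prev, enum reconf_kind cur)
{
	switch (cur) {
	case OUT_REQ:
		return prev == NONE || prev == RESPONSE || prev == IN_REQ;
	case IN_REQ:
		return prev == NONE || prev == OUT_REQ;
	case RESPONSE:
		return prev == NONE || prev == RESPONSE || prev == OUT_REQ;
	case TSN_REQ:
		return prev == NONE;	/* must be the first/only parameter */
	case ADD_IN:
		return prev == NONE || prev == ADD_OUT;
	case ADD_OUT:
		return prev == NONE || prev == ADD_IN;
	default:
		return false;		/* unknown parameter types are rejected */
	}
}

int main(void)
{
	/* e.g. an OUT request may be followed by an IN request, but not by
	 * a second OUT request in the same chunk. */
	printf("OUT_REQ then IN_REQ : %s\n", may_follow(OUT_REQ, IN_REQ) ? "ok" : "reject");
	printf("OUT_REQ then OUT_REQ: %s\n", may_follow(OUT_REQ, OUT_REQ) ? "ok" : "reject");
	return 0;
}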
// SPDX-License-Identifier: GPL-2.0-only
/* Expectation handling for nf_conntrack.
*/ /* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> * (C) 2003,2004 USAGI/WIDE Project <http://www.linux-ipv6.org> * (c) 2005-2012 Patrick McHardy <kaber@trash.net> */ #include <linux/types.h> #include <linux/netfilter.h> #include <linux/skbuff.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/stddef.h> #include <linux/slab.h> #include <linux/err.h> #include <linux/percpu.h> #include <linux/kernel.h> #include <linux/siphash.h> #include <linux/moduleparam.h> #include <linux/export.h> #include <net/net_namespace.h> #include <net/netns/hash.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_ecache.h> #include <net/netfilter/nf_conntrack_expect.h> #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_tuple.h> #include <net/netfilter/nf_conntrack_zones.h> unsigned int nf_ct_expect_hsize __read_mostly; EXPORT_SYMBOL_GPL(nf_ct_expect_hsize); struct hlist_head *nf_ct_expect_hash __read_mostly; EXPORT_SYMBOL_GPL(nf_ct_expect_hash); unsigned int nf_ct_expect_max __read_mostly; static struct kmem_cache *nf_ct_expect_cachep __read_mostly; static siphash_aligned_key_t nf_ct_expect_hashrnd; /* nf_conntrack_expect helper functions */ void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp, u32 portid, int report) { struct nf_conn_help *master_help = nfct_help(exp->master); struct net *net = nf_ct_exp_net(exp); struct nf_conntrack_net *cnet; WARN_ON(!master_help); WARN_ON(timer_pending(&exp->timeout)); hlist_del_rcu(&exp->hnode); cnet = nf_ct_pernet(net); cnet->expect_count--; hlist_del_rcu(&exp->lnode); master_help->expecting[exp->class]--; nf_ct_expect_event_report(IPEXP_DESTROY, exp, portid, report); nf_ct_expect_put(exp); NF_CT_STAT_INC(net, expect_delete); } EXPORT_SYMBOL_GPL(nf_ct_unlink_expect_report); static void nf_ct_expectation_timed_out(struct timer_list *t) { struct nf_conntrack_expect *exp = timer_container_of(exp, t, timeout); spin_lock_bh(&nf_conntrack_expect_lock); nf_ct_unlink_expect(exp); spin_unlock_bh(&nf_conntrack_expect_lock); nf_ct_expect_put(exp); } static unsigned int nf_ct_expect_dst_hash(const struct net *n, const struct nf_conntrack_tuple *tuple) { struct { union nf_inet_addr dst_addr; u32 net_mix; u16 dport; u8 l3num; u8 protonum; } __aligned(SIPHASH_ALIGNMENT) combined; u32 hash; get_random_once(&nf_ct_expect_hashrnd, sizeof(nf_ct_expect_hashrnd)); memset(&combined, 0, sizeof(combined)); combined.dst_addr = tuple->dst.u3; combined.net_mix = net_hash_mix(n); combined.dport = (__force __u16)tuple->dst.u.all; combined.l3num = tuple->src.l3num; combined.protonum = tuple->dst.protonum; hash = siphash(&combined, sizeof(combined), &nf_ct_expect_hashrnd); return reciprocal_scale(hash, nf_ct_expect_hsize); } static bool nf_ct_exp_equal(const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_expect *i, const struct nf_conntrack_zone *zone, const struct net *net) { return nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) && net_eq(net, nf_ct_net(i->master)) && nf_ct_zone_equal_any(i->master, zone); } bool nf_ct_remove_expect(struct nf_conntrack_expect *exp) { if (timer_delete(&exp->timeout)) { nf_ct_unlink_expect(exp); nf_ct_expect_put(exp); return true; } return false; } EXPORT_SYMBOL_GPL(nf_ct_remove_expect); struct nf_conntrack_expect * __nf_ct_expect_find(struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple 
*tuple) { struct nf_conntrack_net *cnet = nf_ct_pernet(net); struct nf_conntrack_expect *i; unsigned int h; if (!cnet->expect_count) return NULL; h = nf_ct_expect_dst_hash(net, tuple); hlist_for_each_entry_rcu(i, &nf_ct_expect_hash[h], hnode) { if (nf_ct_exp_equal(tuple, i, zone, net)) return i; } return NULL; } EXPORT_SYMBOL_GPL(__nf_ct_expect_find); /* Just find a expectation corresponding to a tuple. */ struct nf_conntrack_expect * nf_ct_expect_find_get(struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple) { struct nf_conntrack_expect *i; rcu_read_lock(); i = __nf_ct_expect_find(net, zone, tuple); if (i && !refcount_inc_not_zero(&i->use)) i = NULL; rcu_read_unlock(); return i; } EXPORT_SYMBOL_GPL(nf_ct_expect_find_get); /* If an expectation for this connection is found, it gets delete from * global list then returned. */ struct nf_conntrack_expect * nf_ct_find_expectation(struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple, bool unlink) { struct nf_conntrack_net *cnet = nf_ct_pernet(net); struct nf_conntrack_expect *i, *exp = NULL; unsigned int h; if (!cnet->expect_count) return NULL; h = nf_ct_expect_dst_hash(net, tuple); hlist_for_each_entry(i, &nf_ct_expect_hash[h], hnode) { if (!(i->flags & NF_CT_EXPECT_INACTIVE) && nf_ct_exp_equal(tuple, i, zone, net)) { exp = i; break; } } if (!exp) return NULL; /* If master is not in hash table yet (ie. packet hasn't left this machine yet), how can other end know about expected? Hence these are not the droids you are looking for (if master ct never got confirmed, we'd hold a reference to it and weird things would happen to future packets). */ if (!nf_ct_is_confirmed(exp->master)) return NULL; /* Avoid race with other CPUs, that for exp->master ct, is * about to invoke ->destroy(), or nf_ct_delete() via timeout * or early_drop(). * * The refcount_inc_not_zero() check tells: If that fails, we * know that the ct is being destroyed. If it succeeds, we * can be sure the ct cannot disappear underneath. */ if (unlikely(nf_ct_is_dying(exp->master) || !refcount_inc_not_zero(&exp->master->ct_general.use))) return NULL; if (exp->flags & NF_CT_EXPECT_PERMANENT || !unlink) { refcount_inc(&exp->use); return exp; } else if (timer_delete(&exp->timeout)) { nf_ct_unlink_expect(exp); return exp; } /* Undo exp->master refcnt increase, if timer_delete() failed */ nf_ct_put(exp->master); return NULL; } /* delete all expectations for this conntrack */ void nf_ct_remove_expectations(struct nf_conn *ct) { struct nf_conn_help *help = nfct_help(ct); struct nf_conntrack_expect *exp; struct hlist_node *next; /* Optimization: most connection never expect any others. */ if (!help) return; spin_lock_bh(&nf_conntrack_expect_lock); hlist_for_each_entry_safe(exp, next, &help->expectations, lnode) { nf_ct_remove_expect(exp); } spin_unlock_bh(&nf_conntrack_expect_lock); } EXPORT_SYMBOL_GPL(nf_ct_remove_expectations); /* Would two expected things clash? 
*/ static inline int expect_clash(const struct nf_conntrack_expect *a, const struct nf_conntrack_expect *b) { /* Part covered by intersection of masks must be unequal, otherwise they clash */ struct nf_conntrack_tuple_mask intersect_mask; int count; intersect_mask.src.u.all = a->mask.src.u.all & b->mask.src.u.all; for (count = 0; count < NF_CT_TUPLE_L3SIZE; count++){ intersect_mask.src.u3.all[count] = a->mask.src.u3.all[count] & b->mask.src.u3.all[count]; } return nf_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask) && net_eq(nf_ct_net(a->master), nf_ct_net(b->master)) && nf_ct_zone_equal_any(a->master, nf_ct_zone(b->master)); } static inline int expect_matches(const struct nf_conntrack_expect *a, const struct nf_conntrack_expect *b) { return nf_ct_tuple_equal(&a->tuple, &b->tuple) && nf_ct_tuple_mask_equal(&a->mask, &b->mask) && net_eq(nf_ct_net(a->master), nf_ct_net(b->master)) && nf_ct_zone_equal_any(a->master, nf_ct_zone(b->master)); } static bool master_matches(const struct nf_conntrack_expect *a, const struct nf_conntrack_expect *b, unsigned int flags) { if (flags & NF_CT_EXP_F_SKIP_MASTER) return true; return a->master == b->master; } /* Generally a bad idea to call this: could have matched already. */ void nf_ct_unexpect_related(struct nf_conntrack_expect *exp) { spin_lock_bh(&nf_conntrack_expect_lock); nf_ct_remove_expect(exp); spin_unlock_bh(&nf_conntrack_expect_lock); } EXPORT_SYMBOL_GPL(nf_ct_unexpect_related); /* We don't increase the master conntrack refcount for non-fulfilled * conntracks. During the conntrack destruction, the expectations are * always killed before the conntrack itself */ struct nf_conntrack_expect *nf_ct_expect_alloc(struct nf_conn *me) { struct nf_conntrack_expect *new; new = kmem_cache_alloc(nf_ct_expect_cachep, GFP_ATOMIC); if (!new) return NULL; new->master = me; refcount_set(&new->use, 1); return new; } EXPORT_SYMBOL_GPL(nf_ct_expect_alloc); void nf_ct_expect_init(struct nf_conntrack_expect *exp, unsigned int class, u_int8_t family, const union nf_inet_addr *saddr, const union nf_inet_addr *daddr, u_int8_t proto, const __be16 *src, const __be16 *dst) { int len; if (family == AF_INET) len = 4; else len = 16; exp->flags = 0; exp->class = class; exp->expectfn = NULL; exp->helper = NULL; exp->tuple.src.l3num = family; exp->tuple.dst.protonum = proto; if (saddr) { memcpy(&exp->tuple.src.u3, saddr, len); if (sizeof(exp->tuple.src.u3) > len) /* address needs to be cleared for nf_ct_tuple_equal */ memset((void *)&exp->tuple.src.u3 + len, 0x00, sizeof(exp->tuple.src.u3) - len); memset(&exp->mask.src.u3, 0xFF, len); if (sizeof(exp->mask.src.u3) > len) memset((void *)&exp->mask.src.u3 + len, 0x00, sizeof(exp->mask.src.u3) - len); } else { memset(&exp->tuple.src.u3, 0x00, sizeof(exp->tuple.src.u3)); memset(&exp->mask.src.u3, 0x00, sizeof(exp->mask.src.u3)); } if (src) { exp->tuple.src.u.all = *src; exp->mask.src.u.all = htons(0xFFFF); } else { exp->tuple.src.u.all = 0; exp->mask.src.u.all = 0; } memcpy(&exp->tuple.dst.u3, daddr, len); if (sizeof(exp->tuple.dst.u3) > len) /* address needs to be cleared for nf_ct_tuple_equal */ memset((void *)&exp->tuple.dst.u3 + len, 0x00, sizeof(exp->tuple.dst.u3) - len); exp->tuple.dst.u.all = *dst; #if IS_ENABLED(CONFIG_NF_NAT) memset(&exp->saved_addr, 0, sizeof(exp->saved_addr)); memset(&exp->saved_proto, 0, sizeof(exp->saved_proto)); #endif } EXPORT_SYMBOL_GPL(nf_ct_expect_init); static void nf_ct_expect_free_rcu(struct rcu_head *head) { struct nf_conntrack_expect *exp; exp = container_of(head, struct 
nf_conntrack_expect, rcu); kmem_cache_free(nf_ct_expect_cachep, exp); } void nf_ct_expect_put(struct nf_conntrack_expect *exp) { if (refcount_dec_and_test(&exp->use)) call_rcu(&exp->rcu, nf_ct_expect_free_rcu); } EXPORT_SYMBOL_GPL(nf_ct_expect_put); static void nf_ct_expect_insert(struct nf_conntrack_expect *exp) { struct nf_conntrack_net *cnet; struct nf_conn_help *master_help = nfct_help(exp->master); struct nf_conntrack_helper *helper; struct net *net = nf_ct_exp_net(exp); unsigned int h = nf_ct_expect_dst_hash(net, &exp->tuple); /* two references : one for hash insert, one for the timer */ refcount_add(2, &exp->use); timer_setup(&exp->timeout, nf_ct_expectation_timed_out, 0); helper = rcu_dereference_protected(master_help->helper, lockdep_is_held(&nf_conntrack_expect_lock)); if (helper) { exp->timeout.expires = jiffies + helper->expect_policy[exp->class].timeout * HZ; } add_timer(&exp->timeout); hlist_add_head_rcu(&exp->lnode, &master_help->expectations); master_help->expecting[exp->class]++; hlist_add_head_rcu(&exp->hnode, &nf_ct_expect_hash[h]); cnet = nf_ct_pernet(net); cnet->expect_count++; NF_CT_STAT_INC(net, expect_create); } /* Race with expectations being used means we could have none to find; OK. */ static void evict_oldest_expect(struct nf_conn *master, struct nf_conntrack_expect *new) { struct nf_conn_help *master_help = nfct_help(master); struct nf_conntrack_expect *exp, *last = NULL; hlist_for_each_entry(exp, &master_help->expectations, lnode) { if (exp->class == new->class) last = exp; } if (last) nf_ct_remove_expect(last); } static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect, unsigned int flags) { const struct nf_conntrack_expect_policy *p; struct nf_conntrack_expect *i; struct nf_conntrack_net *cnet; struct nf_conn *master = expect->master; struct nf_conn_help *master_help = nfct_help(master); struct nf_conntrack_helper *helper; struct net *net = nf_ct_exp_net(expect); struct hlist_node *next; unsigned int h; int ret = 0; if (!master_help) { ret = -ESHUTDOWN; goto out; } h = nf_ct_expect_dst_hash(net, &expect->tuple); hlist_for_each_entry_safe(i, next, &nf_ct_expect_hash[h], hnode) { if (master_matches(i, expect, flags) && expect_matches(i, expect)) { if (i->class != expect->class || i->master != expect->master) return -EALREADY; if (nf_ct_remove_expect(i)) break; } else if (expect_clash(i, expect)) { ret = -EBUSY; goto out; } } /* Will be over limit? 
*/ helper = rcu_dereference_protected(master_help->helper, lockdep_is_held(&nf_conntrack_expect_lock)); if (helper) { p = &helper->expect_policy[expect->class]; if (p->max_expected && master_help->expecting[expect->class] >= p->max_expected) { evict_oldest_expect(master, expect); if (master_help->expecting[expect->class] >= p->max_expected) { ret = -EMFILE; goto out; } } } cnet = nf_ct_pernet(net); if (cnet->expect_count >= nf_ct_expect_max) { net_warn_ratelimited("nf_conntrack: expectation table full\n"); ret = -EMFILE; } out: return ret; } int nf_ct_expect_related_report(struct nf_conntrack_expect *expect, u32 portid, int report, unsigned int flags) { int ret; spin_lock_bh(&nf_conntrack_expect_lock); ret = __nf_ct_expect_check(expect, flags); if (ret < 0) goto out; nf_ct_expect_insert(expect); spin_unlock_bh(&nf_conntrack_expect_lock); nf_ct_expect_event_report(IPEXP_NEW, expect, portid, report); return 0; out: spin_unlock_bh(&nf_conntrack_expect_lock); return ret; } EXPORT_SYMBOL_GPL(nf_ct_expect_related_report); void nf_ct_expect_iterate_destroy(bool (*iter)(struct nf_conntrack_expect *e, void *data), void *data) { struct nf_conntrack_expect *exp; const struct hlist_node *next; unsigned int i; spin_lock_bh(&nf_conntrack_expect_lock); for (i = 0; i < nf_ct_expect_hsize; i++) { hlist_for_each_entry_safe(exp, next, &nf_ct_expect_hash[i], hnode) { if (iter(exp, data) && timer_delete(&exp->timeout)) { nf_ct_unlink_expect(exp); nf_ct_expect_put(exp); } } } spin_unlock_bh(&nf_conntrack_expect_lock); } EXPORT_SYMBOL_GPL(nf_ct_expect_iterate_destroy); void nf_ct_expect_iterate_net(struct net *net, bool (*iter)(struct nf_conntrack_expect *e, void *data), void *data, u32 portid, int report) { struct nf_conntrack_expect *exp; const struct hlist_node *next; unsigned int i; spin_lock_bh(&nf_conntrack_expect_lock); for (i = 0; i < nf_ct_expect_hsize; i++) { hlist_for_each_entry_safe(exp, next, &nf_ct_expect_hash[i], hnode) { if (!net_eq(nf_ct_exp_net(exp), net)) continue; if (iter(exp, data) && timer_delete(&exp->timeout)) { nf_ct_unlink_expect_report(exp, portid, report); nf_ct_expect_put(exp); } } } spin_unlock_bh(&nf_conntrack_expect_lock); } EXPORT_SYMBOL_GPL(nf_ct_expect_iterate_net); #ifdef CONFIG_NF_CONNTRACK_PROCFS struct ct_expect_iter_state { struct seq_net_private p; unsigned int bucket; }; static struct hlist_node *ct_expect_get_first(struct seq_file *seq) { struct ct_expect_iter_state *st = seq->private; struct hlist_node *n; for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) { n = rcu_dereference(hlist_first_rcu(&nf_ct_expect_hash[st->bucket])); if (n) return n; } return NULL; } static struct hlist_node *ct_expect_get_next(struct seq_file *seq, struct hlist_node *head) { struct ct_expect_iter_state *st = seq->private; head = rcu_dereference(hlist_next_rcu(head)); while (head == NULL) { if (++st->bucket >= nf_ct_expect_hsize) return NULL; head = rcu_dereference(hlist_first_rcu(&nf_ct_expect_hash[st->bucket])); } return head; } static struct hlist_node *ct_expect_get_idx(struct seq_file *seq, loff_t pos) { struct hlist_node *head = ct_expect_get_first(seq); if (head) while (pos && (head = ct_expect_get_next(seq, head))) pos--; return pos ? 
NULL : head; } static void *exp_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { rcu_read_lock(); return ct_expect_get_idx(seq, *pos); } static void *exp_seq_next(struct seq_file *seq, void *v, loff_t *pos) { (*pos)++; return ct_expect_get_next(seq, v); } static void exp_seq_stop(struct seq_file *seq, void *v) __releases(RCU) { rcu_read_unlock(); } static int exp_seq_show(struct seq_file *s, void *v) { struct nf_conntrack_expect *expect; struct nf_conntrack_helper *helper; struct hlist_node *n = v; char *delim = ""; expect = hlist_entry(n, struct nf_conntrack_expect, hnode); if (expect->timeout.function) seq_printf(s, "%ld ", timer_pending(&expect->timeout) ? (long)(expect->timeout.expires - jiffies)/HZ : 0); else seq_puts(s, "- "); seq_printf(s, "l3proto = %u proto=%u ", expect->tuple.src.l3num, expect->tuple.dst.protonum); print_tuple(s, &expect->tuple, nf_ct_l4proto_find(expect->tuple.dst.protonum)); if (expect->flags & NF_CT_EXPECT_PERMANENT) { seq_puts(s, "PERMANENT"); delim = ","; } if (expect->flags & NF_CT_EXPECT_INACTIVE) { seq_printf(s, "%sINACTIVE", delim); delim = ","; } if (expect->flags & NF_CT_EXPECT_USERSPACE) seq_printf(s, "%sUSERSPACE", delim); helper = rcu_dereference(nfct_help(expect->master)->helper); if (helper) { seq_printf(s, "%s%s", expect->flags ? " " : "", helper->name); if (helper->expect_policy[expect->class].name[0]) seq_printf(s, "/%s", helper->expect_policy[expect->class].name); } seq_putc(s, '\n'); return 0; } static const struct seq_operations exp_seq_ops = { .start = exp_seq_start, .next = exp_seq_next, .stop = exp_seq_stop, .show = exp_seq_show }; #endif /* CONFIG_NF_CONNTRACK_PROCFS */ static int exp_proc_init(struct net *net) { #ifdef CONFIG_NF_CONNTRACK_PROCFS struct proc_dir_entry *proc; kuid_t root_uid; kgid_t root_gid; proc = proc_create_net("nf_conntrack_expect", 0440, net->proc_net, &exp_seq_ops, sizeof(struct ct_expect_iter_state)); if (!proc) return -ENOMEM; root_uid = make_kuid(net->user_ns, 0); root_gid = make_kgid(net->user_ns, 0); if (uid_valid(root_uid) && gid_valid(root_gid)) proc_set_user(proc, root_uid, root_gid); #endif /* CONFIG_NF_CONNTRACK_PROCFS */ return 0; } static void exp_proc_remove(struct net *net) { #ifdef CONFIG_NF_CONNTRACK_PROCFS remove_proc_entry("nf_conntrack_expect", net->proc_net); #endif /* CONFIG_NF_CONNTRACK_PROCFS */ } module_param_named(expect_hashsize, nf_ct_expect_hsize, uint, 0400); int nf_conntrack_expect_pernet_init(struct net *net) { return exp_proc_init(net); } void nf_conntrack_expect_pernet_fini(struct net *net) { exp_proc_remove(net); } int nf_conntrack_expect_init(void) { if (!nf_ct_expect_hsize) { nf_ct_expect_hsize = nf_conntrack_htable_size / 256; if (!nf_ct_expect_hsize) nf_ct_expect_hsize = 1; } nf_ct_expect_max = nf_ct_expect_hsize * 4; nf_ct_expect_cachep = KMEM_CACHE(nf_conntrack_expect, 0); if (!nf_ct_expect_cachep) return -ENOMEM; nf_ct_expect_hash = nf_ct_alloc_hashtable(&nf_ct_expect_hsize, 0); if (!nf_ct_expect_hash) { kmem_cache_destroy(nf_ct_expect_cachep); return -ENOMEM; } return 0; } void nf_conntrack_expect_fini(void) { rcu_barrier(); /* Wait for call_rcu() before destroy */ kmem_cache_destroy(nf_ct_expect_cachep); kvfree(nf_ct_expect_hash); }
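A note on the listing above: the bucket choice in nf_ct_expect_dst_hash() is two steps, siphash the zero-padded destination fields of the tuple with a lazily initialised random key, then map the 32-bit result into [0, nf_ct_expect_hsize) with reciprocal_scale(), which multiplies by the table size and keeps the upper 32 bits instead of taking a modulo. The standalone sketch below reproduces that shape in userspace; the struct layout mirrors the kernel's "combined" key, but the FNV-style mixer, the seed and the sample values are illustrative stand-ins, not the kernel's siphash.

/* Sketch of the expectation-table bucket selection, under the assumptions
 * stated above.  Only the reciprocal_scale() expression matches the kernel
 * helper exactly; the hash is a toy substitute so the example compiles and
 * runs on its own.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint32_t reciprocal_scale(uint32_t val, uint32_t ep_ro)
{
        /* High 32 bits of val * ep_ro: maps val into [0, ep_ro) without '%'. */
        return (uint32_t)(((uint64_t)val * ep_ro) >> 32);
}

/* Toy stand-in for siphash over the destination part of the tuple. */
static uint32_t toy_hash(const void *key, size_t len, uint32_t seed)
{
        const uint8_t *p = key;
        uint32_t h = seed;

        while (len--)
                h = (h ^ *p++) * 0x01000193u;   /* FNV-1a style mix */
        return h;
}

int main(void)
{
        struct {
                uint8_t  dst_addr[16];  /* union nf_inet_addr, zero padded */
                uint32_t net_mix;
                uint16_t dport;
                uint8_t  l3num;
                uint8_t  protonum;
        } combined;
        uint32_t hsize = 256;           /* stands in for nf_ct_expect_hsize */

        memset(&combined, 0, sizeof(combined));
        combined.dst_addr[0] = 192;     /* 192.0.2.1, documentation address */
        combined.dst_addr[2] = 2;
        combined.dst_addr[3] = 1;
        combined.dport = 2021;          /* example destination port */
        combined.l3num = 2;             /* AF_INET */
        combined.protonum = 6;          /* IPPROTO_TCP */

        printf("bucket = %u\n",
               reciprocal_scale(toy_hash(&combined, sizeof(combined), 0x12345678u),
                                hsize));
        return 0;
}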
// SPDX-License-Identifier: GPL-2.0-only
/*
 *      crc16.c
 */

#include <linux/crc16.h>
#include <linux/export.h>
#include <linux/module.h>
#include <linux/types.h>

/** CRC table for the CRC-16. The poly is 0x8005 (x^16 + x^15 + x^2 + 1) */
static const u16 crc16_table[256] = {
        0x0000, 0xC0C1, 0xC181, 0x0140, 0xC301, 0x03C0, 0x0280, 0xC241,
        0xC601, 0x06C0, 0x0780, 0xC741, 0x0500, 0xC5C1, 0xC481, 0x0440,
        0xCC01, 0x0CC0, 0x0D80, 0xCD41, 0x0F00, 0xCFC1, 0xCE81, 0x0E40,
        0x0A00, 0xCAC1, 0xCB81, 0x0B40, 0xC901, 0x09C0, 0x0880, 0xC841,
        0xD801, 0x18C0, 0x1980, 0xD941, 0x1B00, 0xDBC1, 0xDA81, 0x1A40,
        0x1E00, 0xDEC1, 0xDF81, 0x1F40, 0xDD01, 0x1DC0, 0x1C80, 0xDC41,
        0x1400, 0xD4C1, 0xD581, 0x1540, 0xD701, 0x17C0, 0x1680, 0xD641,
        0xD201, 0x12C0, 0x1380, 0xD341, 0x1100, 0xD1C1, 0xD081, 0x1040,
        0xF001, 0x30C0, 0x3180, 0xF141, 0x3300, 0xF3C1, 0xF281, 0x3240,
        0x3600, 0xF6C1, 0xF781, 0x3740, 0xF501, 0x35C0, 0x3480, 0xF441,
        0x3C00, 0xFCC1, 0xFD81, 0x3D40, 0xFF01, 0x3FC0, 0x3E80, 0xFE41,
        0xFA01, 0x3AC0, 0x3B80, 0xFB41, 0x3900, 0xF9C1, 0xF881, 0x3840,
        0x2800, 0xE8C1, 0xE981, 0x2940, 0xEB01, 0x2BC0, 0x2A80, 0xEA41,
        0xEE01, 0x2EC0, 0x2F80, 0xEF41, 0x2D00, 0xEDC1, 0xEC81, 0x2C40,
        0xE401, 0x24C0, 0x2580, 0xE541, 0x2700, 0xE7C1, 0xE681, 0x2640,
        0x2200, 0xE2C1, 0xE381, 0x2340, 0xE101, 0x21C0, 0x2080, 0xE041,
        0xA001, 0x60C0, 0x6180, 0xA141, 0x6300, 0xA3C1, 0xA281, 0x6240,
        0x6600, 0xA6C1, 0xA781, 0x6740, 0xA501, 0x65C0, 0x6480, 0xA441,
        0x6C00, 0xACC1, 0xAD81, 0x6D40, 0xAF01, 0x6FC0, 0x6E80, 0xAE41,
        0xAA01, 0x6AC0, 0x6B80, 0xAB41, 0x6900, 0xA9C1, 0xA881, 0x6840,
        0x7800, 0xB8C1, 0xB981, 0x7940, 0xBB01, 0x7BC0, 0x7A80, 0xBA41,
        0xBE01, 0x7EC0, 0x7F80, 0xBF41, 0x7D00, 0xBDC1, 0xBC81, 0x7C40,
        0xB401, 0x74C0, 0x7580, 0xB541, 0x7700, 0xB7C1, 0xB681, 0x7640,
        0x7200, 0xB2C1, 0xB381, 0x7340, 0xB101, 0x71C0, 0x7080, 0xB041,
        0x5000, 0x90C1, 0x9181, 0x5140, 0x9301, 0x53C0, 0x5280, 0x9241,
        0x9601, 0x56C0, 0x5780, 0x9741, 0x5500, 0x95C1, 0x9481, 0x5440,
        0x9C01, 0x5CC0, 0x5D80, 0x9D41, 0x5F00, 0x9FC1, 0x9E81, 0x5E40,
        0x5A00, 0x9AC1, 0x9B81, 0x5B40, 0x9901, 0x59C0, 0x5880, 0x9841,
        0x8801, 0x48C0, 0x4980, 0x8941, 0x4B00, 0x8BC1, 0x8A81, 0x4A40,
        0x4E00, 0x8EC1, 0x8F81, 0x4F40, 0x8D01, 0x4DC0, 0x4C80, 0x8C41,
        0x4400, 0x84C1, 0x8581, 0x4540, 0x8701, 0x47C0, 0x4680, 0x8641,
        0x8201, 0x42C0, 0x4380, 0x8341, 0x4100, 0x81C1, 0x8081, 0x4040
};

/**
 * crc16 - compute the CRC-16 for the data buffer
 * @crc: previous CRC value
 * @p:   data pointer
 * @len: number of bytes in the buffer
 *
 * Returns the updated CRC value.
 */
u16 crc16(u16 crc, const u8 *p, size_t len)
{
        while (len--)
                crc = (crc >> 8) ^ crc16_table[(crc & 0xff) ^ *p++];
        return crc;
}
EXPORT_SYMBOL(crc16);

MODULE_DESCRIPTION("CRC16 calculations");
MODULE_LICENSE("GPL");
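The table above encodes the reflected (LSB-first) form of the 0x8005 polynomial with a zero initial value, so the same result can be cross-checked byte by byte with a plain bitwise loop over the reversed polynomial 0xA001. The userspace sketch below does that; the check value 0xBB3D for the ASCII string "123456789" is the commonly quoted reference for these CRC-16 parameters, stated here as an assumption rather than taken from the listing.

/* Standalone bitwise cross-check of the table-driven crc16() above. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint16_t crc16_bitwise(uint16_t crc, const uint8_t *p, size_t len)
{
        while (len--) {
                crc ^= *p++;
                for (int bit = 0; bit < 8; bit++)
                        /* Reflected 0x8005 polynomial, processed LSB first. */
                        crc = (crc & 1) ? (crc >> 1) ^ 0xA001 : crc >> 1;
        }
        return crc;
}

int main(void)
{
        const char *msg = "123456789";
        uint16_t crc = crc16_bitwise(0, (const uint8_t *)msg, strlen(msg));

        printf("crc16(\"%s\") = 0x%04X\n", msg, crc);
        assert(crc == 0xBB3D);  /* expected check value for these parameters */
        return 0;
}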
831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/file.c * * Copyright (C) 1992, 1993, 1994, 1995 * Remy Card (card@masi.ibp.fr) * Laboratoire MASI - Institut Blaise Pascal * Universite Pierre et Marie Curie (Paris VI) * * from * * linux/fs/minix/file.c * * Copyright (C) 1991, 1992 Linus Torvalds * * ext4 fs regular file handling primitives * * 64-bit file support on 64-bit platforms by Jakub Jelinek * (jj@sunsite.ms.mff.cuni.cz) */ #include <linux/time.h> #include <linux/fs.h> #include <linux/iomap.h> #include <linux/mount.h> #include <linux/path.h> #include <linux/dax.h> #include <linux/quotaops.h> #include <linux/pagevec.h> #include <linux/uio.h> #include <linux/mman.h> #include <linux/backing-dev.h> #include "ext4.h" #include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" #include "truncate.h" /* * Returns %true if the given DIO request should be attempted with DIO, or * %false if it should fall back to buffered I/O. * * DIO isn't well specified; when it's unsupported (either due to the request * being misaligned, or due to the file not supporting DIO at all), filesystems * either fall back to buffered I/O or return EINVAL. For files that don't use * any special features like encryption or verity, ext4 has traditionally * returned EINVAL for misaligned DIO. iomap_dio_rw() uses this convention too. * In this case, we should attempt the DIO, *not* fall back to buffered I/O. * * In contrast, in cases where DIO is unsupported due to ext4 features, ext4 * traditionally falls back to buffered I/O. * * This function implements the traditional ext4 behavior in all these cases. */ static bool ext4_should_use_dio(struct kiocb *iocb, struct iov_iter *iter) { struct inode *inode = file_inode(iocb->ki_filp); u32 dio_align = ext4_dio_alignment(inode); if (dio_align == 0) return false; if (dio_align == 1) return true; return IS_ALIGNED(iocb->ki_pos | iov_iter_alignment(iter), dio_align); } static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to) { ssize_t ret; struct inode *inode = file_inode(iocb->ki_filp); if (iocb->ki_flags & IOCB_NOWAIT) { if (!inode_trylock_shared(inode)) return -EAGAIN; } else { inode_lock_shared(inode); } if (!ext4_should_use_dio(iocb, to)) { inode_unlock_shared(inode); /* * Fallback to buffered I/O if the operation being performed on * the inode is not supported by direct I/O. The IOCB_DIRECT * flag needs to be cleared here in order to ensure that the * direct I/O path within generic_file_read_iter() is not * taken. 
*/ iocb->ki_flags &= ~IOCB_DIRECT; return generic_file_read_iter(iocb, to); } ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL, 0, NULL, 0); inode_unlock_shared(inode); file_accessed(iocb->ki_filp); return ret; } #ifdef CONFIG_FS_DAX static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct inode *inode = file_inode(iocb->ki_filp); ssize_t ret; if (iocb->ki_flags & IOCB_NOWAIT) { if (!inode_trylock_shared(inode)) return -EAGAIN; } else { inode_lock_shared(inode); } /* * Recheck under inode lock - at this point we are sure it cannot * change anymore */ if (!IS_DAX(inode)) { inode_unlock_shared(inode); /* Fallback to buffered IO in case we cannot support DAX */ return generic_file_read_iter(iocb, to); } ret = dax_iomap_rw(iocb, to, &ext4_iomap_ops); inode_unlock_shared(inode); file_accessed(iocb->ki_filp); return ret; } #endif static ssize_t ext4_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct inode *inode = file_inode(iocb->ki_filp); if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; if (!iov_iter_count(to)) return 0; /* skip atime */ #ifdef CONFIG_FS_DAX if (IS_DAX(inode)) return ext4_dax_read_iter(iocb, to); #endif if (iocb->ki_flags & IOCB_DIRECT) return ext4_dio_read_iter(iocb, to); return generic_file_read_iter(iocb, to); } static ssize_t ext4_file_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { struct inode *inode = file_inode(in); if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; return filemap_splice_read(in, ppos, pipe, len, flags); } /* * Called when an inode is released. Note that this is different * from ext4_file_open: open gets called at every open, but release * gets called only when /all/ the files are closed. */ static int ext4_release_file(struct inode *inode, struct file *filp) { if (ext4_test_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE)) { ext4_alloc_da_blocks(inode); ext4_clear_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); } /* if we are the last writer on the inode, drop the block reservation */ if ((filp->f_mode & FMODE_WRITE) && (atomic_read(&inode->i_writecount) == 1) && !EXT4_I(inode)->i_reserved_data_blocks) { down_write(&EXT4_I(inode)->i_data_sem); ext4_discard_preallocations(inode); up_write(&EXT4_I(inode)->i_data_sem); } if (is_dx(inode) && filp->private_data) ext4_htree_free_dir_info(filp->private_data); return 0; } /* * This tests whether the IO in question is block-aligned or not. * Ext4 utilizes unwritten extents when hole-filling during direct IO, and they * are converted to written only after the IO is complete. Until they are * mapped, these blocks appear as holes, so dio_zero_block() will assume that * it needs to zero out portions of the start and/or end block. If 2 AIO * threads are at work on the same unwritten block, they must be synchronized * or one thread will zero the other's data, causing corruption. */ static bool ext4_unaligned_io(struct inode *inode, struct iov_iter *from, loff_t pos) { struct super_block *sb = inode->i_sb; unsigned long blockmask = sb->s_blocksize - 1; if ((pos | iov_iter_alignment(from)) & blockmask) return true; return false; } static bool ext4_extending_io(struct inode *inode, loff_t offset, size_t len) { if (offset + len > i_size_read(inode) || offset + len > EXT4_I(inode)->i_disksize) return true; return false; } /* Is IO overwriting allocated or initialized blocks? 
*/ static bool ext4_overwrite_io(struct inode *inode, loff_t pos, loff_t len, bool *unwritten) { struct ext4_map_blocks map; unsigned int blkbits = inode->i_blkbits; int err, blklen; if (pos + len > i_size_read(inode)) return false; map.m_lblk = pos >> blkbits; map.m_len = EXT4_MAX_BLOCKS(len, pos, blkbits); blklen = map.m_len; err = ext4_map_blocks(NULL, inode, &map, 0); if (err != blklen) return false; /* * 'err==len' means that all of the blocks have been preallocated, * regardless of whether they have been initialized or not. We need to * check m_flags to distinguish the unwritten extents. */ *unwritten = !(map.m_flags & EXT4_MAP_MAPPED); return true; } static ssize_t ext4_generic_write_checks(struct kiocb *iocb, struct iov_iter *from) { struct inode *inode = file_inode(iocb->ki_filp); ssize_t ret; if (unlikely(IS_IMMUTABLE(inode))) return -EPERM; ret = generic_write_checks(iocb, from); if (ret <= 0) return ret; /* * If we have encountered a bitmap-format file, the size limit * is smaller than s_maxbytes, which is for extent-mapped files. */ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); if (iocb->ki_pos >= sbi->s_bitmap_maxbytes) return -EFBIG; iov_iter_truncate(from, sbi->s_bitmap_maxbytes - iocb->ki_pos); } return iov_iter_count(from); } static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from) { ssize_t ret, count; count = ext4_generic_write_checks(iocb, from); if (count <= 0) return count; ret = file_modified(iocb->ki_filp); if (ret) return ret; return count; } static ssize_t ext4_buffered_write_iter(struct kiocb *iocb, struct iov_iter *from) { ssize_t ret; struct inode *inode = file_inode(iocb->ki_filp); if (iocb->ki_flags & IOCB_NOWAIT) return -EOPNOTSUPP; inode_lock(inode); ret = ext4_write_checks(iocb, from); if (ret <= 0) goto out; ret = generic_perform_write(iocb, from); out: inode_unlock(inode); if (unlikely(ret <= 0)) return ret; return generic_write_sync(iocb, ret); } static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset, ssize_t written, ssize_t count) { handle_t *handle; lockdep_assert_held_write(&inode->i_rwsem); handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); if (IS_ERR(handle)) return PTR_ERR(handle); if (ext4_update_inode_size(inode, offset + written)) { int ret = ext4_mark_inode_dirty(handle, inode); if (unlikely(ret)) { ext4_journal_stop(handle); return ret; } } if ((written == count) && inode->i_nlink) ext4_orphan_del(handle, inode); ext4_journal_stop(handle); return written; } /* * Clean up the inode after DIO or DAX extending write has completed and the * inode size has been updated using ext4_handle_inode_extension(). */ static void ext4_inode_extension_cleanup(struct inode *inode, bool need_trunc) { lockdep_assert_held_write(&inode->i_rwsem); if (need_trunc) { ext4_truncate_failed_write(inode); /* * If the truncate operation failed early, then the inode may * still be on the orphan list. In that case, we need to try * remove the inode from the in-memory linked list. */ if (inode->i_nlink) ext4_orphan_del(NULL, inode); return; } /* * If i_disksize got extended either due to writeback of delalloc * blocks or extending truncate while the DIO was running we could fail * to cleanup the orphan list in ext4_handle_inode_extension(). Do it * now. */ if (ext4_inode_orphan_tracked(inode) && inode->i_nlink) { handle_t *handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); if (IS_ERR(handle)) { /* * The write has successfully completed. 
Not much to * do with the error here so just cleanup the orphan * list and hope for the best. */ ext4_orphan_del(NULL, inode); return; } ext4_orphan_del(handle, inode); ext4_journal_stop(handle); } } static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size, int error, unsigned int flags) { loff_t pos = iocb->ki_pos; struct inode *inode = file_inode(iocb->ki_filp); if (!error && size && (flags & IOMAP_DIO_UNWRITTEN) && (iocb->ki_flags & IOCB_ATOMIC)) error = ext4_convert_unwritten_extents_atomic(NULL, inode, pos, size); else if (!error && size && flags & IOMAP_DIO_UNWRITTEN) error = ext4_convert_unwritten_extents(NULL, inode, pos, size); if (error) return error; /* * Note that EXT4_I(inode)->i_disksize can get extended up to * inode->i_size while the I/O was running due to writeback of delalloc * blocks. But the code in ext4_iomap_alloc() is careful to use * zeroed/unwritten extents if this is possible; thus we won't leave * uninitialized blocks in a file even if we didn't succeed in writing * as much as we intended. Also we can race with truncate or write * expanding the file so we have to be a bit careful here. */ if (pos + size <= READ_ONCE(EXT4_I(inode)->i_disksize) && pos + size <= i_size_read(inode)) return 0; error = ext4_handle_inode_extension(inode, pos, size, size); return error < 0 ? error : 0; } static const struct iomap_dio_ops ext4_dio_write_ops = { .end_io = ext4_dio_write_end_io, }; /* * The intention here is to start with shared lock acquired then see if any * condition requires an exclusive inode lock. If yes, then we restart the * whole operation by releasing the shared lock and acquiring exclusive lock. * * - For unaligned_io we never take shared lock as it may cause data corruption * when two unaligned IO tries to modify the same block e.g. while zeroing. * * - For extending writes case we don't take the shared lock, since it requires * updating inode i_disksize and/or orphan handling with exclusive lock. * * - shared locking will only be true mostly with overwrites, including * initialized blocks and unwritten blocks. For overwrite unwritten blocks * we protect splitting extents by i_data_sem in ext4_inode_info, so we can * also release exclusive i_rwsem lock. * * - Otherwise we will switch to exclusive i_rwsem lock. */ static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from, bool *ilock_shared, bool *extend, bool *unwritten, int *dio_flags) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); loff_t offset; size_t count; ssize_t ret; bool overwrite, unaligned_io; restart: ret = ext4_generic_write_checks(iocb, from); if (ret <= 0) goto out; offset = iocb->ki_pos; count = ret; unaligned_io = ext4_unaligned_io(inode, from, offset); *extend = ext4_extending_io(inode, offset, count); overwrite = ext4_overwrite_io(inode, offset, count, unwritten); /* * Determine whether we need to upgrade to an exclusive lock. This is * required to change security info in file_modified(), for extending * I/O, any form of non-overwrite I/O, and unaligned I/O to unwritten * extents (as partial block zeroing may be required). * * Note that unaligned writes are allowed under shared lock so long as * they are pure overwrites. Otherwise, concurrent unaligned writes risk * data corruption due to partial block zeroing in the dio layer, and so * the I/O must occur exclusively. 
*/ if (*ilock_shared && ((!IS_NOSEC(inode) || *extend || !overwrite || (unaligned_io && *unwritten)))) { if (iocb->ki_flags & IOCB_NOWAIT) { ret = -EAGAIN; goto out; } inode_unlock_shared(inode); *ilock_shared = false; inode_lock(inode); goto restart; } /* * Now that locking is settled, determine dio flags and exclusivity * requirements. We don't use DIO_OVERWRITE_ONLY because we enforce * behavior already. The inode lock is already held exclusive if the * write is non-overwrite or extending, so drain all outstanding dio and * set the force wait dio flag. */ if (!*ilock_shared && (unaligned_io || *extend)) { if (iocb->ki_flags & IOCB_NOWAIT) { ret = -EAGAIN; goto out; } if (unaligned_io && (!overwrite || *unwritten)) inode_dio_wait(inode); *dio_flags = IOMAP_DIO_FORCE_WAIT; } ret = file_modified(file); if (ret < 0) goto out; return count; out: if (*ilock_shared) inode_unlock_shared(inode); else inode_unlock(inode); return ret; } static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) { ssize_t ret; handle_t *handle; struct inode *inode = file_inode(iocb->ki_filp); loff_t offset = iocb->ki_pos; size_t count = iov_iter_count(from); const struct iomap_ops *iomap_ops = &ext4_iomap_ops; bool extend = false, unwritten = false; bool ilock_shared = true; int dio_flags = 0; /* * Quick check here without any i_rwsem lock to see if it is extending * IO. A more reliable check is done in ext4_dio_write_checks() with * proper locking in place. */ if (offset + count > i_size_read(inode)) ilock_shared = false; if (iocb->ki_flags & IOCB_NOWAIT) { if (ilock_shared) { if (!inode_trylock_shared(inode)) return -EAGAIN; } else { if (!inode_trylock(inode)) return -EAGAIN; } } else { if (ilock_shared) inode_lock_shared(inode); else inode_lock(inode); } /* Fallback to buffered I/O if the inode does not support direct I/O. */ if (!ext4_should_use_dio(iocb, from)) { if (ilock_shared) inode_unlock_shared(inode); else inode_unlock(inode); return ext4_buffered_write_iter(iocb, from); } /* * Prevent inline data from being created since we are going to allocate * blocks for DIO. We know the inode does not currently have inline data * because ext4_should_use_dio() checked for it, but we have to clear * the state flag before the write checks because a lock cycle could * introduce races with other writers. */ ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); ret = ext4_dio_write_checks(iocb, from, &ilock_shared, &extend, &unwritten, &dio_flags); if (ret <= 0) return ret; offset = iocb->ki_pos; count = ret; if (extend) { handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); if (IS_ERR(handle)) { ret = PTR_ERR(handle); goto out; } ret = ext4_orphan_add(handle, inode); ext4_journal_stop(handle); if (ret) goto out; } if (ilock_shared && !unwritten) iomap_ops = &ext4_iomap_overwrite_ops; ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops, dio_flags, NULL, 0); if (ret == -ENOTBLK) ret = 0; if (extend) { /* * We always perform extending DIO write synchronously so by * now the IO is completed and ext4_handle_inode_extension() * was called. Cleanup the inode in case of error or race with * writeback of delalloc blocks. */ WARN_ON_ONCE(ret == -EIOCBQUEUED); ext4_inode_extension_cleanup(inode, ret < 0); } out: if (ilock_shared) inode_unlock_shared(inode); else inode_unlock(inode); if (ret >= 0 && iov_iter_count(from)) { ssize_t err; loff_t endbyte; /* * There is no support for atomic writes on buffered-io yet, * we should never fallback to buffered-io for DIO atomic * writes. 
*/ WARN_ON_ONCE(iocb->ki_flags & IOCB_ATOMIC); offset = iocb->ki_pos; err = ext4_buffered_write_iter(iocb, from); if (err < 0) return err; /* * We need to ensure that the pages within the page cache for * the range covered by this I/O are written to disk and * invalidated. This is in attempt to preserve the expected * direct I/O semantics in the case we fallback to buffered I/O * to complete off the I/O request. */ ret += err; endbyte = offset + err - 1; err = filemap_write_and_wait_range(iocb->ki_filp->f_mapping, offset, endbyte); if (!err) invalidate_mapping_pages(iocb->ki_filp->f_mapping, offset >> PAGE_SHIFT, endbyte >> PAGE_SHIFT); } return ret; } #ifdef CONFIG_FS_DAX static ssize_t ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from) { ssize_t ret; size_t count; loff_t offset; handle_t *handle; bool extend = false; struct inode *inode = file_inode(iocb->ki_filp); if (iocb->ki_flags & IOCB_NOWAIT) { if (!inode_trylock(inode)) return -EAGAIN; } else { inode_lock(inode); } ret = ext4_write_checks(iocb, from); if (ret <= 0) goto out; offset = iocb->ki_pos; count = iov_iter_count(from); if (offset + count > EXT4_I(inode)->i_disksize) { handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); if (IS_ERR(handle)) { ret = PTR_ERR(handle); goto out; } ret = ext4_orphan_add(handle, inode); if (ret) { ext4_journal_stop(handle); goto out; } extend = true; ext4_journal_stop(handle); } ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops); if (extend) { ret = ext4_handle_inode_extension(inode, offset, ret, count); ext4_inode_extension_cleanup(inode, ret < (ssize_t)count); } out: inode_unlock(inode); if (ret > 0) ret = generic_write_sync(iocb, ret); return ret; } #endif static ssize_t ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { int ret; struct inode *inode = file_inode(iocb->ki_filp); ret = ext4_emergency_state(inode->i_sb); if (unlikely(ret)) return ret; #ifdef CONFIG_FS_DAX if (IS_DAX(inode)) return ext4_dax_write_iter(iocb, from); #endif if (iocb->ki_flags & IOCB_ATOMIC) { size_t len = iov_iter_count(from); if (len < EXT4_SB(inode->i_sb)->s_awu_min || len > EXT4_SB(inode->i_sb)->s_awu_max) return -EINVAL; ret = generic_atomic_write_valid(iocb, from); if (ret) return ret; } if (iocb->ki_flags & IOCB_DIRECT) return ext4_dio_write_iter(iocb, from); else return ext4_buffered_write_iter(iocb, from); } #ifdef CONFIG_FS_DAX static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf, unsigned int order) { int error = 0; vm_fault_t result; int retries = 0; handle_t *handle = NULL; struct inode *inode = file_inode(vmf->vma->vm_file); struct super_block *sb = inode->i_sb; /* * We have to distinguish real writes from writes which will result in a * COW page; COW writes should *not* poke the journal (the file will not * be changed). Doing so would cause unintended failures when mounted * read-only. * * We check for VM_SHARED rather than vmf->cow_page since the latter is * unset for order != 0 (i.e. only in do_cow_fault); for * other sizes, dax_iomap_fault will handle splitting / fallback so that * we eventually come back with a COW page. 
*/ bool write = (vmf->flags & FAULT_FLAG_WRITE) && (vmf->vma->vm_flags & VM_SHARED); struct address_space *mapping = vmf->vma->vm_file->f_mapping; unsigned long pfn; if (write) { sb_start_pagefault(sb); file_update_time(vmf->vma->vm_file); filemap_invalidate_lock_shared(mapping); retry: handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE, EXT4_DATA_TRANS_BLOCKS(sb)); if (IS_ERR(handle)) { filemap_invalidate_unlock_shared(mapping); sb_end_pagefault(sb); return VM_FAULT_SIGBUS; } } else { filemap_invalidate_lock_shared(mapping); } result = dax_iomap_fault(vmf, order, &pfn, &error, &ext4_iomap_ops); if (write) { ext4_journal_stop(handle); if ((result & VM_FAULT_ERROR) && error == -ENOSPC && ext4_should_retry_alloc(sb, &retries)) goto retry; /* Handling synchronous page fault? */ if (result & VM_FAULT_NEEDDSYNC) result = dax_finish_sync_fault(vmf, order, pfn); filemap_invalidate_unlock_shared(mapping); sb_end_pagefault(sb); } else { filemap_invalidate_unlock_shared(mapping); } return result; } static vm_fault_t ext4_dax_fault(struct vm_fault *vmf) { return ext4_dax_huge_fault(vmf, 0); } static const struct vm_operations_struct ext4_dax_vm_ops = { .fault = ext4_dax_fault, .huge_fault = ext4_dax_huge_fault, .page_mkwrite = ext4_dax_fault, .pfn_mkwrite = ext4_dax_fault, }; #else #define ext4_dax_vm_ops ext4_file_vm_ops #endif static const struct vm_operations_struct ext4_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = ext4_page_mkwrite, }; static int ext4_file_mmap_prepare(struct vm_area_desc *desc) { int ret; struct file *file = desc->file; struct inode *inode = file->f_mapping->host; struct dax_device *dax_dev = EXT4_SB(inode->i_sb)->s_daxdev; if (file->f_mode & FMODE_WRITE) ret = ext4_emergency_state(inode->i_sb); else ret = ext4_forced_shutdown(inode->i_sb) ? -EIO : 0; if (unlikely(ret)) return ret; /* * We don't support synchronous mappings for non-DAX files and * for DAX files if underneath dax_device is not synchronous. */ if (!daxdev_mapping_supported(desc->vm_flags, file_inode(file), dax_dev)) return -EOPNOTSUPP; file_accessed(file); if (IS_DAX(file_inode(file))) { desc->vm_ops = &ext4_dax_vm_ops; desc->vm_flags |= VM_HUGEPAGE; } else { desc->vm_ops = &ext4_file_vm_ops; } return 0; } static int ext4_sample_last_mounted(struct super_block *sb, struct vfsmount *mnt) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct path path; char buf[64], *cp; handle_t *handle; int err; if (likely(ext4_test_mount_flag(sb, EXT4_MF_MNTDIR_SAMPLED))) return 0; if (ext4_emergency_state(sb) || sb_rdonly(sb) || !sb_start_intwrite_trylock(sb)) return 0; ext4_set_mount_flag(sb, EXT4_MF_MNTDIR_SAMPLED); /* * Sample where the filesystem has been mounted and * store it in the superblock for sysadmin convenience * when trying to sort through large numbers of block * devices or filesystem images. 
*/ memset(buf, 0, sizeof(buf)); path.mnt = mnt; path.dentry = mnt->mnt_root; cp = d_path(&path, buf, sizeof(buf)); err = 0; if (IS_ERR(cp)) goto out; handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1); err = PTR_ERR(handle); if (IS_ERR(handle)) goto out; BUFFER_TRACE(sbi->s_sbh, "get_write_access"); err = ext4_journal_get_write_access(handle, sb, sbi->s_sbh, EXT4_JTR_NONE); if (err) goto out_journal; lock_buffer(sbi->s_sbh); strtomem_pad(sbi->s_es->s_last_mounted, cp, 0); ext4_superblock_csum_set(sb); unlock_buffer(sbi->s_sbh); ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); out_journal: ext4_journal_stop(handle); out: sb_end_intwrite(sb); return err; } static int ext4_file_open(struct inode *inode, struct file *filp) { int ret; if (filp->f_mode & FMODE_WRITE) ret = ext4_emergency_state(inode->i_sb); else ret = ext4_forced_shutdown(inode->i_sb) ? -EIO : 0; if (unlikely(ret)) return ret; ret = ext4_sample_last_mounted(inode->i_sb, filp->f_path.mnt); if (ret) return ret; ret = fscrypt_file_open(inode, filp); if (ret) return ret; ret = fsverity_file_open(inode, filp); if (ret) return ret; /* * Set up the jbd2_inode if we are opening the inode for * writing and the journal is present */ if (filp->f_mode & FMODE_WRITE) { ret = ext4_inode_attach_jinode(inode); if (ret < 0) return ret; } if (ext4_inode_can_atomic_write(inode)) filp->f_mode |= FMODE_CAN_ATOMIC_WRITE; filp->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT; return dquot_file_open(inode, filp); } /* * ext4_llseek() handles both block-mapped and extent-mapped maxbytes values * by calling generic_file_llseek_size() with the appropriate maxbytes * value for each. */ loff_t ext4_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file->f_mapping->host; loff_t maxbytes = ext4_get_maxbytes(inode); switch (whence) { default: return generic_file_llseek_size(file, offset, whence, maxbytes, i_size_read(inode)); case SEEK_HOLE: inode_lock_shared(inode); offset = iomap_seek_hole(inode, offset, &ext4_iomap_report_ops); inode_unlock_shared(inode); break; case SEEK_DATA: inode_lock_shared(inode); offset = iomap_seek_data(inode, offset, &ext4_iomap_report_ops); inode_unlock_shared(inode); break; } if (offset < 0) return offset; return vfs_setpos(file, offset, maxbytes); } const struct file_operations ext4_file_operations = { .llseek = ext4_llseek, .read_iter = ext4_file_read_iter, .write_iter = ext4_file_write_iter, .iopoll = iocb_bio_iopoll, .unlocked_ioctl = ext4_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ext4_compat_ioctl, #endif .mmap_prepare = ext4_file_mmap_prepare, .open = ext4_file_open, .release = ext4_release_file, .fsync = ext4_sync_file, .get_unmapped_area = thp_get_unmapped_area, .splice_read = ext4_file_splice_read, .splice_write = iter_file_splice_write, .fallocate = ext4_fallocate, .fop_flags = FOP_MMAP_SYNC | FOP_BUFFER_RASYNC | FOP_DIO_PARALLEL_WRITE | FOP_DONTCACHE, }; const struct inode_operations ext4_file_inode_operations = { .setattr = ext4_setattr, .getattr = ext4_file_getattr, .listxattr = ext4_listxattr, .get_inode_acl = ext4_get_acl, .set_acl = ext4_set_acl, .fiemap = ext4_fiemap, .fileattr_get = ext4_fileattr_get, .fileattr_set = ext4_fileattr_set, };
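One detail of the ext4 listing above worth calling out is the alignment idiom shared by ext4_should_use_dio() and ext4_unaligned_io(): the file position and the iovec alignment are OR'ed together and the combined value is tested against the required alignment mask, so a misalignment in any contributor leaves a low bit set and the test fails. The sketch below isolates just that idiom in userspace; the function name and the sample values are made up for the demonstration.

/* Minimal, self-contained illustration of the "OR the offsets, then mask"
 * alignment check used on the ext4 DIO paths above.  dio_align is assumed
 * to be a power of two, as a block size is.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool dio_aligned(uint64_t pos, uint64_t iov_align, uint32_t dio_align)
{
        return ((pos | iov_align) & (dio_align - 1)) == 0;
}

int main(void)
{
        uint32_t blocksize = 4096;

        printf("%d\n", dio_aligned(8192, 4096, blocksize)); /* 1: fully aligned      */
        printf("%d\n", dio_aligned(8192,  512, blocksize)); /* 0: buffer misaligned  */
        printf("%d\n", dio_aligned( 100, 4096, blocksize)); /* 0: position misaligned */
        return 0;
}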
// SPDX-License-Identifier: GPL-2.0-only
/*
 * fs/libfs.c
 * Library for filesystems writers.
 */

#include <linux/blkdev.h>
#include <linux/export.h>
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/cred.h>
#include <linux/mount.h>
#include <linux/vfs.h>
#include <linux/quotaops.h>
#include <linux/mutex.h>
#include <linux/namei.h>
#include <linux/exportfs.h>
#include <linux/iversion.h>
#include <linux/writeback.h>
#include <linux/buffer_head.h> /* sync_mapping_buffers */
#include <linux/fs_context.h>
#include <linux/pseudo_fs.h>
#include <linux/fsnotify.h>
#include <linux/unicode.h>
#include <linux/fscrypt.h>
#include <linux/pidfs.h>
#include <linux/uaccess.h>

#include "internal.h"

int simple_getattr(struct mnt_idmap *idmap, const struct path *path,
                   struct kstat *stat, u32 request_mask,
                   unsigned int query_flags)
{
        struct inode *inode = d_inode(path->dentry);

        generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
        stat->blocks = inode->i_mapping->nrpages << (PAGE_SHIFT - 9);
        return 0;
}
EXPORT_SYMBOL(simple_getattr);

int simple_statfs(struct dentry *dentry, struct kstatfs *buf)
{
        u64 id = huge_encode_dev(dentry->d_sb->s_dev);

        buf->f_fsid = u64_to_fsid(id);
        buf->f_type = dentry->d_sb->s_magic;
        buf->f_bsize = PAGE_SIZE;
        buf->f_namelen = NAME_MAX;
        return 0;
}
EXPORT_SYMBOL(simple_statfs);

/*
 * Retaining negative dentries for an in-memory filesystem just wastes
 * memory and lookup time: arrange for them to be deleted immediately.
 */
int always_delete_dentry(const struct dentry *dentry)
{
        return 1;
}
EXPORT_SYMBOL(always_delete_dentry);

/*
 * Lookup the data. This is trivial - if the dentry didn't already
 * exist, we know it is negative. Set d_op to delete negative dentries.
 */
struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
{
        if (dentry->d_name.len > NAME_MAX)
                return ERR_PTR(-ENAMETOOLONG);
        if (!dentry->d_op && !(dentry->d_flags & DCACHE_DONTCACHE)) {
                spin_lock(&dentry->d_lock);
                dentry->d_flags |= DCACHE_DONTCACHE;
                spin_unlock(&dentry->d_lock);
        }
        if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir))
                return NULL;
        d_add(dentry, NULL);
        return NULL;
}
EXPORT_SYMBOL(simple_lookup);

int dcache_dir_open(struct inode *inode, struct file *file)
{
        file->private_data = d_alloc_cursor(file->f_path.dentry);

        return file->private_data ? 0 : -ENOMEM;
}
EXPORT_SYMBOL(dcache_dir_open);

int dcache_dir_close(struct inode *inode, struct file *file)
{
        dput(file->private_data);
        return 0;
}
EXPORT_SYMBOL(dcache_dir_close);

/* parent is locked at least shared */
/*
 * Returns an element of siblings' list.
 * We are looking for <count>th positive after <p>; if
 * found, dentry is grabbed and returned to caller.
 * If no such element exists, NULL is returned.
*/ static struct dentry *scan_positives(struct dentry *cursor, struct hlist_node **p, loff_t count, struct dentry *last) { struct dentry *dentry = cursor->d_parent, *found = NULL; spin_lock(&dentry->d_lock); while (*p) { struct dentry *d = hlist_entry(*p, struct dentry, d_sib); p = &d->d_sib.next; // we must at least skip cursors, to avoid livelocks if (d->d_flags & DCACHE_DENTRY_CURSOR) continue; if (simple_positive(d) && !--count) { spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED); if (simple_positive(d)) found = dget_dlock(d); spin_unlock(&d->d_lock); if (likely(found)) break; count = 1; } if (need_resched()) { if (!hlist_unhashed(&cursor->d_sib)) __hlist_del(&cursor->d_sib); hlist_add_behind(&cursor->d_sib, &d->d_sib); p = &cursor->d_sib.next; spin_unlock(&dentry->d_lock); cond_resched(); spin_lock(&dentry->d_lock); } } spin_unlock(&dentry->d_lock); dput(last); return found; } loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence) { struct dentry *dentry = file->f_path.dentry; switch (whence) { case 1: offset += file->f_pos; fallthrough; case 0: if (offset >= 0) break; fallthrough; default: return -EINVAL; } if (offset != file->f_pos) { struct dentry *cursor = file->private_data; struct dentry *to = NULL; inode_lock_shared(dentry->d_inode); if (offset > 2) to = scan_positives(cursor, &dentry->d_children.first, offset - 2, NULL); spin_lock(&dentry->d_lock); hlist_del_init(&cursor->d_sib); if (to) hlist_add_behind(&cursor->d_sib, &to->d_sib); spin_unlock(&dentry->d_lock); dput(to); file->f_pos = offset; inode_unlock_shared(dentry->d_inode); } return offset; } EXPORT_SYMBOL(dcache_dir_lseek); /* * Directory is locked and all positive dentries in it are safe, since * for ramfs-type trees they can't go away without unlink() or rmdir(), * both impossible due to the lock on directory. 
*/ int dcache_readdir(struct file *file, struct dir_context *ctx) { struct dentry *dentry = file->f_path.dentry; struct dentry *cursor = file->private_data; struct dentry *next = NULL; struct hlist_node **p; if (!dir_emit_dots(file, ctx)) return 0; if (ctx->pos == 2) p = &dentry->d_children.first; else p = &cursor->d_sib.next; while ((next = scan_positives(cursor, p, 1, next)) != NULL) { if (!dir_emit(ctx, next->d_name.name, next->d_name.len, d_inode(next)->i_ino, fs_umode_to_dtype(d_inode(next)->i_mode))) break; ctx->pos++; p = &next->d_sib.next; } spin_lock(&dentry->d_lock); hlist_del_init(&cursor->d_sib); if (next) hlist_add_before(&cursor->d_sib, &next->d_sib); spin_unlock(&dentry->d_lock); dput(next); return 0; } EXPORT_SYMBOL(dcache_readdir); ssize_t generic_read_dir(struct file *filp, char __user *buf, size_t siz, loff_t *ppos) { return -EISDIR; } EXPORT_SYMBOL(generic_read_dir); const struct file_operations simple_dir_operations = { .open = dcache_dir_open, .release = dcache_dir_close, .llseek = dcache_dir_lseek, .read = generic_read_dir, .iterate_shared = dcache_readdir, .fsync = noop_fsync, }; EXPORT_SYMBOL(simple_dir_operations); const struct inode_operations simple_dir_inode_operations = { .lookup = simple_lookup, }; EXPORT_SYMBOL(simple_dir_inode_operations); /* simple_offset_add() never assigns these to a dentry */ enum { DIR_OFFSET_FIRST = 2, /* Find first real entry */ DIR_OFFSET_EOD = S32_MAX, }; /* simple_offset_add() allocation range */ enum { DIR_OFFSET_MIN = DIR_OFFSET_FIRST + 1, DIR_OFFSET_MAX = DIR_OFFSET_EOD - 1, }; static void offset_set(struct dentry *dentry, long offset) { dentry->d_fsdata = (void *)offset; } static long dentry2offset(struct dentry *dentry) { return (long)dentry->d_fsdata; } static struct lock_class_key simple_offset_lock_class; /** * simple_offset_init - initialize an offset_ctx * @octx: directory offset map to be initialized * */ void simple_offset_init(struct offset_ctx *octx) { mt_init_flags(&octx->mt, MT_FLAGS_ALLOC_RANGE); lockdep_set_class(&octx->mt.ma_lock, &simple_offset_lock_class); octx->next_offset = DIR_OFFSET_MIN; } /** * simple_offset_add - Add an entry to a directory's offset map * @octx: directory offset ctx to be updated * @dentry: new dentry being added * * Returns zero on success. @octx and the dentry's offset are updated. * Otherwise, a negative errno value is returned. */ int simple_offset_add(struct offset_ctx *octx, struct dentry *dentry) { unsigned long offset; int ret; if (dentry2offset(dentry) != 0) return -EBUSY; ret = mtree_alloc_cyclic(&octx->mt, &offset, dentry, DIR_OFFSET_MIN, DIR_OFFSET_MAX, &octx->next_offset, GFP_KERNEL); if (unlikely(ret < 0)) return ret == -EBUSY ? 
-ENOSPC : ret; offset_set(dentry, offset); return 0; } static int simple_offset_replace(struct offset_ctx *octx, struct dentry *dentry, long offset) { int ret; ret = mtree_store(&octx->mt, offset, dentry, GFP_KERNEL); if (ret) return ret; offset_set(dentry, offset); return 0; } /** * simple_offset_remove - Remove an entry to a directory's offset map * @octx: directory offset ctx to be updated * @dentry: dentry being removed * */ void simple_offset_remove(struct offset_ctx *octx, struct dentry *dentry) { long offset; offset = dentry2offset(dentry); if (offset == 0) return; mtree_erase(&octx->mt, offset); offset_set(dentry, 0); } /** * simple_offset_rename - handle directory offsets for rename * @old_dir: parent directory of source entry * @old_dentry: dentry of source entry * @new_dir: parent_directory of destination entry * @new_dentry: dentry of destination * * Caller provides appropriate serialization. * * User space expects the directory offset value of the replaced * (new) directory entry to be unchanged after a rename. * * Returns zero on success, a negative errno value on failure. */ int simple_offset_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { struct offset_ctx *old_ctx = old_dir->i_op->get_offset_ctx(old_dir); struct offset_ctx *new_ctx = new_dir->i_op->get_offset_ctx(new_dir); long new_offset = dentry2offset(new_dentry); simple_offset_remove(old_ctx, old_dentry); if (new_offset) { offset_set(new_dentry, 0); return simple_offset_replace(new_ctx, old_dentry, new_offset); } return simple_offset_add(new_ctx, old_dentry); } /** * simple_offset_rename_exchange - exchange rename with directory offsets * @old_dir: parent of dentry being moved * @old_dentry: dentry being moved * @new_dir: destination parent * @new_dentry: destination dentry * * This API preserves the directory offset values. Caller provides * appropriate serialization. * * Returns zero on success. Otherwise a negative errno is returned and the * rename is rolled back. */ int simple_offset_rename_exchange(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { struct offset_ctx *old_ctx = old_dir->i_op->get_offset_ctx(old_dir); struct offset_ctx *new_ctx = new_dir->i_op->get_offset_ctx(new_dir); long old_index = dentry2offset(old_dentry); long new_index = dentry2offset(new_dentry); int ret; simple_offset_remove(old_ctx, old_dentry); simple_offset_remove(new_ctx, new_dentry); ret = simple_offset_replace(new_ctx, old_dentry, new_index); if (ret) goto out_restore; ret = simple_offset_replace(old_ctx, new_dentry, old_index); if (ret) { simple_offset_remove(new_ctx, old_dentry); goto out_restore; } ret = simple_rename_exchange(old_dir, old_dentry, new_dir, new_dentry); if (ret) { simple_offset_remove(new_ctx, old_dentry); simple_offset_remove(old_ctx, new_dentry); goto out_restore; } return 0; out_restore: (void)simple_offset_replace(old_ctx, old_dentry, old_index); (void)simple_offset_replace(new_ctx, new_dentry, new_index); return ret; } /** * simple_offset_destroy - Release offset map * @octx: directory offset ctx that is about to be destroyed * * During fs teardown (eg. umount), a directory's offset map might still * contain entries. xa_destroy() cleans out anything that remains. 
*/ void simple_offset_destroy(struct offset_ctx *octx) { mtree_destroy(&octx->mt); } /** * offset_dir_llseek - Advance the read position of a directory descriptor * @file: an open directory whose position is to be updated * @offset: a byte offset * @whence: enumerator describing the starting position for this update * * SEEK_END, SEEK_DATA, and SEEK_HOLE are not supported for directories. * * Returns the updated read position if successful; otherwise a * negative errno is returned and the read position remains unchanged. */ static loff_t offset_dir_llseek(struct file *file, loff_t offset, int whence) { switch (whence) { case SEEK_CUR: offset += file->f_pos; fallthrough; case SEEK_SET: if (offset >= 0) break; fallthrough; default: return -EINVAL; } return vfs_setpos(file, offset, LONG_MAX); } static struct dentry *find_positive_dentry(struct dentry *parent, struct dentry *dentry, bool next) { struct dentry *found = NULL; spin_lock(&parent->d_lock); if (next) dentry = d_next_sibling(dentry); else if (!dentry) dentry = d_first_child(parent); hlist_for_each_entry_from(dentry, d_sib) { if (!simple_positive(dentry)) continue; spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); if (simple_positive(dentry)) found = dget_dlock(dentry); spin_unlock(&dentry->d_lock); if (likely(found)) break; } spin_unlock(&parent->d_lock); return found; } static noinline_for_stack struct dentry * offset_dir_lookup(struct dentry *parent, loff_t offset) { struct inode *inode = d_inode(parent); struct offset_ctx *octx = inode->i_op->get_offset_ctx(inode); struct dentry *child, *found = NULL; MA_STATE(mas, &octx->mt, offset, offset); if (offset == DIR_OFFSET_FIRST) found = find_positive_dentry(parent, NULL, false); else { rcu_read_lock(); child = mas_find_rev(&mas, DIR_OFFSET_MIN); found = find_positive_dentry(parent, child, false); rcu_read_unlock(); } return found; } static bool offset_dir_emit(struct dir_context *ctx, struct dentry *dentry) { struct inode *inode = d_inode(dentry); return dir_emit(ctx, dentry->d_name.name, dentry->d_name.len, inode->i_ino, fs_umode_to_dtype(inode->i_mode)); } static void offset_iterate_dir(struct file *file, struct dir_context *ctx) { struct dentry *dir = file->f_path.dentry; struct dentry *dentry; dentry = offset_dir_lookup(dir, ctx->pos); if (!dentry) goto out_eod; while (true) { struct dentry *next; ctx->pos = dentry2offset(dentry); if (!offset_dir_emit(ctx, dentry)) break; next = find_positive_dentry(dir, dentry, true); dput(dentry); if (!next) goto out_eod; dentry = next; } dput(dentry); return; out_eod: ctx->pos = DIR_OFFSET_EOD; } /** * offset_readdir - Emit entries starting at offset @ctx->pos * @file: an open directory to iterate over * @ctx: directory iteration context * * Caller must hold @file's i_rwsem to prevent insertion or removal of * entries during this call. * * On entry, @ctx->pos contains an offset that represents the first entry * to be read from the directory. * * The operation continues until there are no more entries to read, or * until the ctx->actor indicates there is no more space in the caller's * output buffer. * * On return, @ctx->pos contains an offset that will read the next entry * in this directory when offset_readdir() is called again with @ctx. * Caller places this value in the d_off field of the last entry in the * user's buffer. 
* * Return values: * %0 - Complete */ static int offset_readdir(struct file *file, struct dir_context *ctx) { struct dentry *dir = file->f_path.dentry; lockdep_assert_held(&d_inode(dir)->i_rwsem); if (!dir_emit_dots(file, ctx)) return 0; if (ctx->pos != DIR_OFFSET_EOD) offset_iterate_dir(file, ctx); return 0; } const struct file_operations simple_offset_dir_operations = { .llseek = offset_dir_llseek, .iterate_shared = offset_readdir, .read = generic_read_dir, .fsync = noop_fsync, }; struct dentry *find_next_child(struct dentry *parent, struct dentry *prev) { struct dentry *child = NULL, *d; spin_lock(&parent->d_lock); d = prev ? d_next_sibling(prev) : d_first_child(parent); hlist_for_each_entry_from(d, d_sib) { if (simple_positive(d)) { spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED); if (simple_positive(d)) child = dget_dlock(d); spin_unlock(&d->d_lock); if (likely(child)) break; } } spin_unlock(&parent->d_lock); dput(prev); return child; } EXPORT_SYMBOL(find_next_child); static void __simple_recursive_removal(struct dentry *dentry, void (*callback)(struct dentry *), bool locked) { struct dentry *this = dget(dentry); while (true) { struct dentry *victim = NULL, *child; struct inode *inode = this->d_inode; inode_lock_nested(inode, I_MUTEX_CHILD); if (d_is_dir(this)) inode->i_flags |= S_DEAD; while ((child = find_next_child(this, victim)) == NULL) { // kill and ascend // update metadata while it's still locked inode_set_ctime_current(inode); clear_nlink(inode); inode_unlock(inode); victim = this; this = this->d_parent; inode = this->d_inode; if (!locked || victim != dentry) inode_lock_nested(inode, I_MUTEX_CHILD); if (simple_positive(victim)) { d_invalidate(victim); // avoid lost mounts if (callback) callback(victim); fsnotify_delete(inode, d_inode(victim), victim); dput(victim); // unpin it } if (victim == dentry) { inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); if (d_is_dir(dentry)) drop_nlink(inode); if (!locked) inode_unlock(inode); dput(dentry); return; } } inode_unlock(inode); this = child; } } void simple_recursive_removal(struct dentry *dentry, void (*callback)(struct dentry *)) { return __simple_recursive_removal(dentry, callback, false); } EXPORT_SYMBOL(simple_recursive_removal); /* caller holds parent directory with I_MUTEX_PARENT */ void locked_recursive_removal(struct dentry *dentry, void (*callback)(struct dentry *)) { return __simple_recursive_removal(dentry, callback, true); } EXPORT_SYMBOL(locked_recursive_removal); static const struct super_operations simple_super_operations = { .statfs = simple_statfs, }; static int pseudo_fs_fill_super(struct super_block *s, struct fs_context *fc) { struct pseudo_fs_context *ctx = fc->fs_private; struct inode *root; s->s_maxbytes = MAX_LFS_FILESIZE; s->s_blocksize = PAGE_SIZE; s->s_blocksize_bits = PAGE_SHIFT; s->s_magic = ctx->magic; s->s_op = ctx->ops ?: &simple_super_operations; s->s_export_op = ctx->eops; s->s_xattr = ctx->xattr; s->s_time_gran = 1; root = new_inode(s); if (!root) return -ENOMEM; /* * since this is the first inode, make it number 1. New inodes created * after this must take care not to collide with it (by passing * max_reserved of 1 to iunique). 
*/ root->i_ino = 1; root->i_mode = S_IFDIR | S_IRUSR | S_IWUSR; simple_inode_init_ts(root); s->s_root = d_make_root(root); if (!s->s_root) return -ENOMEM; set_default_d_op(s, ctx->dops); return 0; } static int pseudo_fs_get_tree(struct fs_context *fc) { return get_tree_nodev(fc, pseudo_fs_fill_super); } static void pseudo_fs_free(struct fs_context *fc) { kfree(fc->fs_private); } static const struct fs_context_operations pseudo_fs_context_ops = { .free = pseudo_fs_free, .get_tree = pseudo_fs_get_tree, }; /* * Common helper for pseudo-filesystems (sockfs, pipefs, bdev - stuff that * will never be mountable) */ struct pseudo_fs_context *init_pseudo(struct fs_context *fc, unsigned long magic) { struct pseudo_fs_context *ctx; ctx = kzalloc(sizeof(struct pseudo_fs_context), GFP_KERNEL); if (likely(ctx)) { ctx->magic = magic; fc->fs_private = ctx; fc->ops = &pseudo_fs_context_ops; fc->sb_flags |= SB_NOUSER; fc->global = true; } return ctx; } EXPORT_SYMBOL(init_pseudo); int simple_open(struct inode *inode, struct file *file) { if (inode->i_private) file->private_data = inode->i_private; return 0; } EXPORT_SYMBOL(simple_open); int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { struct inode *inode = d_inode(old_dentry); inode_set_mtime_to_ts(dir, inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode))); inc_nlink(inode); ihold(inode); dget(dentry); d_instantiate(dentry, inode); return 0; } EXPORT_SYMBOL(simple_link); int simple_empty(struct dentry *dentry) { struct dentry *child; int ret = 0; spin_lock(&dentry->d_lock); hlist_for_each_entry(child, &dentry->d_children, d_sib) { spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED); if (simple_positive(child)) { spin_unlock(&child->d_lock); goto out; } spin_unlock(&child->d_lock); } ret = 1; out: spin_unlock(&dentry->d_lock); return ret; } EXPORT_SYMBOL(simple_empty); int simple_unlink(struct inode *dir, struct dentry *dentry) { struct inode *inode = d_inode(dentry); inode_set_mtime_to_ts(dir, inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode))); drop_nlink(inode); dput(dentry); return 0; } EXPORT_SYMBOL(simple_unlink); int simple_rmdir(struct inode *dir, struct dentry *dentry) { if (!simple_empty(dentry)) return -ENOTEMPTY; drop_nlink(d_inode(dentry)); simple_unlink(dir, dentry); drop_nlink(dir); return 0; } EXPORT_SYMBOL(simple_rmdir); /** * simple_rename_timestamp - update the various inode timestamps for rename * @old_dir: old parent directory * @old_dentry: dentry that is being renamed * @new_dir: new parent directory * @new_dentry: target for rename * * POSIX mandates that the old and new parent directories have their ctime and * mtime updated, and that inodes of @old_dentry and @new_dentry (if any), have * their ctime updated. 
*/ void simple_rename_timestamp(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { struct inode *newino = d_inode(new_dentry); inode_set_mtime_to_ts(old_dir, inode_set_ctime_current(old_dir)); if (new_dir != old_dir) inode_set_mtime_to_ts(new_dir, inode_set_ctime_current(new_dir)); inode_set_ctime_current(d_inode(old_dentry)); if (newino) inode_set_ctime_current(newino); } EXPORT_SYMBOL_GPL(simple_rename_timestamp); int simple_rename_exchange(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { bool old_is_dir = d_is_dir(old_dentry); bool new_is_dir = d_is_dir(new_dentry); if (old_dir != new_dir && old_is_dir != new_is_dir) { if (old_is_dir) { drop_nlink(old_dir); inc_nlink(new_dir); } else { drop_nlink(new_dir); inc_nlink(old_dir); } } simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry); return 0; } EXPORT_SYMBOL_GPL(simple_rename_exchange); int simple_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { int they_are_dirs = d_is_dir(old_dentry); if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) return -EINVAL; if (flags & RENAME_EXCHANGE) return simple_rename_exchange(old_dir, old_dentry, new_dir, new_dentry); if (!simple_empty(new_dentry)) return -ENOTEMPTY; if (d_really_is_positive(new_dentry)) { simple_unlink(new_dir, new_dentry); if (they_are_dirs) { drop_nlink(d_inode(new_dentry)); drop_nlink(old_dir); } } else if (they_are_dirs) { drop_nlink(old_dir); inc_nlink(new_dir); } simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry); return 0; } EXPORT_SYMBOL(simple_rename); /** * simple_setattr - setattr for simple filesystem * @idmap: idmap of the target mount * @dentry: dentry * @iattr: iattr structure * * Returns 0 on success, -error on failure. * * simple_setattr is a simple ->setattr implementation without a proper * implementation of size changes. * * It can either be used for in-memory filesystems or special files * on simple regular filesystems. Anything that needs to change on-disk * or wire state on size changes needs its own setattr method. 
*/ int simple_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr) { struct inode *inode = d_inode(dentry); int error; error = setattr_prepare(idmap, dentry, iattr); if (error) return error; if (iattr->ia_valid & ATTR_SIZE) truncate_setsize(inode, iattr->ia_size); setattr_copy(idmap, inode, iattr); mark_inode_dirty(inode); return 0; } EXPORT_SYMBOL(simple_setattr); static int simple_read_folio(struct file *file, struct folio *folio) { folio_zero_range(folio, 0, folio_size(folio)); flush_dcache_folio(folio); folio_mark_uptodate(folio); folio_unlock(folio); return 0; } int simple_write_begin(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { struct folio *folio; folio = __filemap_get_folio(mapping, pos / PAGE_SIZE, FGP_WRITEBEGIN, mapping_gfp_mask(mapping)); if (IS_ERR(folio)) return PTR_ERR(folio); *foliop = folio; if (!folio_test_uptodate(folio) && (len != folio_size(folio))) { size_t from = offset_in_folio(folio, pos); folio_zero_segments(folio, 0, from, from + len, folio_size(folio)); } return 0; } EXPORT_SYMBOL(simple_write_begin); /** * simple_write_end - .write_end helper for non-block-device FSes * @iocb: kernel I/O control block * @mapping: " * @pos: " * @len: " * @copied: " * @folio: " * @fsdata: " * * simple_write_end does the minimum needed for updating a folio after * writing is done. It has the same API signature as the .write_end of * address_space_operations vector. So it can just be set onto .write_end for * FSes that don't need any other processing. i_rwsem is assumed to be held * exclusively. * Block based filesystems should use generic_write_end(). * NOTE: Even though i_size might get updated by this function, mark_inode_dirty * is not called, so a filesystem that actually does store data in .write_inode * should extend on what's done here with a call to mark_inode_dirty() in the * case that i_size has changed. * * Use *ONLY* with simple_read_folio() */ static int simple_write_end(const struct kiocb *iocb, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct folio *folio, void *fsdata) { struct inode *inode = folio->mapping->host; loff_t last_pos = pos + copied; /* zero the stale part of the folio if we did a short copy */ if (!folio_test_uptodate(folio)) { if (copied < len) { size_t from = offset_in_folio(folio, pos); folio_zero_range(folio, from + copied, len - copied); } folio_mark_uptodate(folio); } /* * No need to use i_size_read() here, the i_size * cannot change under us because we hold the i_rwsem. */ if (last_pos > inode->i_size) i_size_write(inode, last_pos); folio_mark_dirty(folio); folio_unlock(folio); folio_put(folio); return copied; } /* * Provides ramfs-style behavior: data in the pagecache, but no writeback. */ const struct address_space_operations ram_aops = { .read_folio = simple_read_folio, .write_begin = simple_write_begin, .write_end = simple_write_end, .dirty_folio = noop_dirty_folio, }; EXPORT_SYMBOL(ram_aops); /* * the inodes created here are not hashed. If you use iunique to generate * unique inode values later for this filesystem, then you must take care * to pass it an appropriate max_reserved value to avoid collisions. 
*/ int simple_fill_super(struct super_block *s, unsigned long magic, const struct tree_descr *files) { struct inode *inode; struct dentry *dentry; int i; s->s_blocksize = PAGE_SIZE; s->s_blocksize_bits = PAGE_SHIFT; s->s_magic = magic; s->s_op = &simple_super_operations; s->s_time_gran = 1; inode = new_inode(s); if (!inode) return -ENOMEM; /* * because the root inode is 1, the files array must not contain an * entry at index 1 */ inode->i_ino = 1; inode->i_mode = S_IFDIR | 0755; simple_inode_init_ts(inode); inode->i_op = &simple_dir_inode_operations; inode->i_fop = &simple_dir_operations; set_nlink(inode, 2); s->s_root = d_make_root(inode); if (!s->s_root) return -ENOMEM; for (i = 0; !files->name || files->name[0]; i++, files++) { if (!files->name) continue; /* warn if it tries to conflict with the root inode */ if (unlikely(i == 1)) printk(KERN_WARNING "%s: %s passed in a files array" "with an index of 1!\n", __func__, s->s_type->name); dentry = d_alloc_name(s->s_root, files->name); if (!dentry) return -ENOMEM; inode = new_inode(s); if (!inode) { dput(dentry); return -ENOMEM; } inode->i_mode = S_IFREG | files->mode; simple_inode_init_ts(inode); inode->i_fop = files->ops; inode->i_ino = i; d_add(dentry, inode); } return 0; } EXPORT_SYMBOL(simple_fill_super); static DEFINE_SPINLOCK(pin_fs_lock); int simple_pin_fs(struct file_system_type *type, struct vfsmount **mount, int *count) { struct vfsmount *mnt = NULL; spin_lock(&pin_fs_lock); if (unlikely(!*mount)) { spin_unlock(&pin_fs_lock); mnt = vfs_kern_mount(type, SB_KERNMOUNT, type->name, NULL); if (IS_ERR(mnt)) return PTR_ERR(mnt); spin_lock(&pin_fs_lock); if (!*mount) *mount = mnt; } mntget(*mount); ++*count; spin_unlock(&pin_fs_lock); mntput(mnt); return 0; } EXPORT_SYMBOL(simple_pin_fs); void simple_release_fs(struct vfsmount **mount, int *count) { struct vfsmount *mnt; spin_lock(&pin_fs_lock); mnt = *mount; if (!--*count) *mount = NULL; spin_unlock(&pin_fs_lock); mntput(mnt); } EXPORT_SYMBOL(simple_release_fs); /** * simple_read_from_buffer - copy data from the buffer to user space * @to: the user space buffer to read to * @count: the maximum number of bytes to read * @ppos: the current position in the buffer * @from: the buffer to read from * @available: the size of the buffer * * The simple_read_from_buffer() function reads up to @count bytes from the * buffer @from at offset @ppos into the user space address starting at @to. * * On success, the number of bytes read is returned and the offset @ppos is * advanced by this number, or negative value is returned on error. **/ ssize_t simple_read_from_buffer(void __user *to, size_t count, loff_t *ppos, const void *from, size_t available) { loff_t pos = *ppos; size_t ret; if (pos < 0) return -EINVAL; if (pos >= available || !count) return 0; if (count > available - pos) count = available - pos; ret = copy_to_user(to, from + pos, count); if (ret == count) return -EFAULT; count -= ret; *ppos = pos + count; return count; } EXPORT_SYMBOL(simple_read_from_buffer); /** * simple_write_to_buffer - copy data from user space to the buffer * @to: the buffer to write to * @available: the size of the buffer * @ppos: the current position in the buffer * @from: the user space buffer to read from * @count: the maximum number of bytes to read * * The simple_write_to_buffer() function reads up to @count bytes from the user * space address starting at @from into the buffer @to at offset @ppos. 
* * On success, the number of bytes written is returned and the offset @ppos is * advanced by this number, or negative value is returned on error. **/ ssize_t simple_write_to_buffer(void *to, size_t available, loff_t *ppos, const void __user *from, size_t count) { loff_t pos = *ppos; size_t res; if (pos < 0) return -EINVAL; if (pos >= available || !count) return 0; if (count > available - pos) count = available - pos; res = copy_from_user(to + pos, from, count); if (res == count) return -EFAULT; count -= res; *ppos = pos + count; return count; } EXPORT_SYMBOL(simple_write_to_buffer); /** * memory_read_from_buffer - copy data from the buffer * @to: the kernel space buffer to read to * @count: the maximum number of bytes to read * @ppos: the current position in the buffer * @from: the buffer to read from * @available: the size of the buffer * * The memory_read_from_buffer() function reads up to @count bytes from the * buffer @from at offset @ppos into the kernel space address starting at @to. * * On success, the number of bytes read is returned and the offset @ppos is * advanced by this number, or negative value is returned on error. **/ ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos, const void *from, size_t available) { loff_t pos = *ppos; if (pos < 0) return -EINVAL; if (pos >= available) return 0; if (count > available - pos) count = available - pos; memcpy(to, from + pos, count); *ppos = pos + count; return count; } EXPORT_SYMBOL(memory_read_from_buffer); /* * Transaction based IO. * The file expects a single write which triggers the transaction, and then * possibly a read which collects the result - which is stored in a * file-local buffer. */ void simple_transaction_set(struct file *file, size_t n) { struct simple_transaction_argresp *ar = file->private_data; BUG_ON(n > SIMPLE_TRANSACTION_LIMIT); /* * The barrier ensures that ar->size will really remain zero until * ar->data is ready for reading. 
*/ smp_mb(); ar->size = n; } EXPORT_SYMBOL(simple_transaction_set); char *simple_transaction_get(struct file *file, const char __user *buf, size_t size) { struct simple_transaction_argresp *ar; static DEFINE_SPINLOCK(simple_transaction_lock); if (size > SIMPLE_TRANSACTION_LIMIT - 1) return ERR_PTR(-EFBIG); ar = (struct simple_transaction_argresp *)get_zeroed_page(GFP_KERNEL); if (!ar) return ERR_PTR(-ENOMEM); spin_lock(&simple_transaction_lock); /* only one write allowed per open */ if (file->private_data) { spin_unlock(&simple_transaction_lock); free_page((unsigned long)ar); return ERR_PTR(-EBUSY); } file->private_data = ar; spin_unlock(&simple_transaction_lock); if (copy_from_user(ar->data, buf, size)) return ERR_PTR(-EFAULT); return ar->data; } EXPORT_SYMBOL(simple_transaction_get); ssize_t simple_transaction_read(struct file *file, char __user *buf, size_t size, loff_t *pos) { struct simple_transaction_argresp *ar = file->private_data; if (!ar) return 0; return simple_read_from_buffer(buf, size, pos, ar->data, ar->size); } EXPORT_SYMBOL(simple_transaction_read); int simple_transaction_release(struct inode *inode, struct file *file) { free_page((unsigned long)file->private_data); return 0; } EXPORT_SYMBOL(simple_transaction_release); /* Simple attribute files */ struct simple_attr { int (*get)(void *, u64 *); int (*set)(void *, u64); char get_buf[24]; /* enough to store a u64 and "\n\0" */ char set_buf[24]; void *data; const char *fmt; /* format for read operation */ struct mutex mutex; /* protects access to these buffers */ }; /* simple_attr_open is called by an actual attribute open file operation * to set the attribute specific access operations. */ int simple_attr_open(struct inode *inode, struct file *file, int (*get)(void *, u64 *), int (*set)(void *, u64), const char *fmt) { struct simple_attr *attr; attr = kzalloc(sizeof(*attr), GFP_KERNEL); if (!attr) return -ENOMEM; attr->get = get; attr->set = set; attr->data = inode->i_private; attr->fmt = fmt; mutex_init(&attr->mutex); file->private_data = attr; return nonseekable_open(inode, file); } EXPORT_SYMBOL_GPL(simple_attr_open); int simple_attr_release(struct inode *inode, struct file *file) { kfree(file->private_data); return 0; } EXPORT_SYMBOL_GPL(simple_attr_release); /* GPL-only? This? Really? 
*/ /* read from the buffer that is filled with the get function */ ssize_t simple_attr_read(struct file *file, char __user *buf, size_t len, loff_t *ppos) { struct simple_attr *attr; size_t size; ssize_t ret; attr = file->private_data; if (!attr->get) return -EACCES; ret = mutex_lock_interruptible(&attr->mutex); if (ret) return ret; if (*ppos && attr->get_buf[0]) { /* continued read */ size = strlen(attr->get_buf); } else { /* first read */ u64 val; ret = attr->get(attr->data, &val); if (ret) goto out; size = scnprintf(attr->get_buf, sizeof(attr->get_buf), attr->fmt, (unsigned long long)val); } ret = simple_read_from_buffer(buf, len, ppos, attr->get_buf, size); out: mutex_unlock(&attr->mutex); return ret; } EXPORT_SYMBOL_GPL(simple_attr_read); /* interpret the buffer as a number to call the set function with */ static ssize_t simple_attr_write_xsigned(struct file *file, const char __user *buf, size_t len, loff_t *ppos, bool is_signed) { struct simple_attr *attr; unsigned long long val; size_t size; ssize_t ret; attr = file->private_data; if (!attr->set) return -EACCES; ret = mutex_lock_interruptible(&attr->mutex); if (ret) return ret; ret = -EFAULT; size = min(sizeof(attr->set_buf) - 1, len); if (copy_from_user(attr->set_buf, buf, size)) goto out; attr->set_buf[size] = '\0'; if (is_signed) ret = kstrtoll(attr->set_buf, 0, &val); else ret = kstrtoull(attr->set_buf, 0, &val); if (ret) goto out; ret = attr->set(attr->data, val); if (ret == 0) ret = len; /* on success, claim we got the whole input */ out: mutex_unlock(&attr->mutex); return ret; } ssize_t simple_attr_write(struct file *file, const char __user *buf, size_t len, loff_t *ppos) { return simple_attr_write_xsigned(file, buf, len, ppos, false); } EXPORT_SYMBOL_GPL(simple_attr_write); ssize_t simple_attr_write_signed(struct file *file, const char __user *buf, size_t len, loff_t *ppos) { return simple_attr_write_xsigned(file, buf, len, ppos, true); } EXPORT_SYMBOL_GPL(simple_attr_write_signed); /** * generic_encode_ino32_fh - generic export_operations->encode_fh function * @inode: the object to encode * @fh: where to store the file handle fragment * @max_len: maximum length to store there (in 4 byte units) * @parent: parent directory inode, if wanted * * This generic encode_fh function assumes that the 32 inode number * is suitable for locating an inode, and that the generation number * can be used to check that it is still valid. It places them in the * filehandle fragment where export_decode_fh expects to find them. 
*/ int generic_encode_ino32_fh(struct inode *inode, __u32 *fh, int *max_len, struct inode *parent) { struct fid *fid = (void *)fh; int len = *max_len; int type = FILEID_INO32_GEN; if (parent && (len < 4)) { *max_len = 4; return FILEID_INVALID; } else if (len < 2) { *max_len = 2; return FILEID_INVALID; } len = 2; fid->i32.ino = inode->i_ino; fid->i32.gen = inode->i_generation; if (parent) { fid->i32.parent_ino = parent->i_ino; fid->i32.parent_gen = parent->i_generation; len = 4; type = FILEID_INO32_GEN_PARENT; } *max_len = len; return type; } EXPORT_SYMBOL_GPL(generic_encode_ino32_fh); /** * generic_fh_to_dentry - generic helper for the fh_to_dentry export operation * @sb: filesystem to do the file handle conversion on * @fid: file handle to convert * @fh_len: length of the file handle in bytes * @fh_type: type of file handle * @get_inode: filesystem callback to retrieve inode * * This function decodes @fid as long as it has one of the well-known * Linux filehandle types and calls @get_inode on it to retrieve the * inode for the object specified in the file handle. */ struct dentry *generic_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type, struct inode *(*get_inode) (struct super_block *sb, u64 ino, u32 gen)) { struct inode *inode = NULL; if (fh_len < 2) return NULL; switch (fh_type) { case FILEID_INO32_GEN: case FILEID_INO32_GEN_PARENT: inode = get_inode(sb, fid->i32.ino, fid->i32.gen); break; } return d_obtain_alias(inode); } EXPORT_SYMBOL_GPL(generic_fh_to_dentry); /** * generic_fh_to_parent - generic helper for the fh_to_parent export operation * @sb: filesystem to do the file handle conversion on * @fid: file handle to convert * @fh_len: length of the file handle in bytes * @fh_type: type of file handle * @get_inode: filesystem callback to retrieve inode * * This function decodes @fid as long as it has one of the well-known * Linux filehandle types and calls @get_inode on it to retrieve the * inode for the _parent_ object specified in the file handle if it * is specified in the file handle, or NULL otherwise. */ struct dentry *generic_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len, int fh_type, struct inode *(*get_inode) (struct super_block *sb, u64 ino, u32 gen)) { struct inode *inode = NULL; if (fh_len <= 2) return NULL; switch (fh_type) { case FILEID_INO32_GEN_PARENT: inode = get_inode(sb, fid->i32.parent_ino, (fh_len > 3 ? fid->i32.parent_gen : 0)); break; } return d_obtain_alias(inode); } EXPORT_SYMBOL_GPL(generic_fh_to_parent); /** * __generic_file_fsync - generic fsync implementation for simple filesystems * * @file: file to synchronize * @start: start offset in bytes * @end: end offset in bytes (inclusive) * @datasync: only synchronize essential metadata if true * * This is a generic implementation of the fsync method for simple * filesystems which track all non-inode metadata in the buffers list * hanging off the address_space structure. 
*/ int __generic_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) { struct inode *inode = file->f_mapping->host; int err; int ret; err = file_write_and_wait_range(file, start, end); if (err) return err; inode_lock(inode); ret = sync_mapping_buffers(inode->i_mapping); if (!(inode->i_state & I_DIRTY_ALL)) goto out; if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) goto out; err = sync_inode_metadata(inode, 1); if (ret == 0) ret = err; out: inode_unlock(inode); /* check and advance again to catch errors after syncing out buffers */ err = file_check_and_advance_wb_err(file); if (ret == 0) ret = err; return ret; } EXPORT_SYMBOL(__generic_file_fsync); /** * generic_file_fsync - generic fsync implementation for simple filesystems * with flush * @file: file to synchronize * @start: start offset in bytes * @end: end offset in bytes (inclusive) * @datasync: only synchronize essential metadata if true * */ int generic_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) { struct inode *inode = file->f_mapping->host; int err; err = __generic_file_fsync(file, start, end, datasync); if (err) return err; return blkdev_issue_flush(inode->i_sb->s_bdev); } EXPORT_SYMBOL(generic_file_fsync); /** * generic_check_addressable - Check addressability of file system * @blocksize_bits: log of file system block size * @num_blocks: number of blocks in file system * * Determine whether a file system with @num_blocks blocks (and a * block size of 2**@blocksize_bits) is addressable by the sector_t * and page cache of the system. Return 0 if so and -EFBIG otherwise. */ int generic_check_addressable(unsigned blocksize_bits, u64 num_blocks) { u64 last_fs_block = num_blocks - 1; u64 last_fs_page, max_bytes; if (check_shl_overflow(num_blocks, blocksize_bits, &max_bytes)) return -EFBIG; last_fs_page = (max_bytes >> PAGE_SHIFT) - 1; if (unlikely(num_blocks == 0)) return 0; if (blocksize_bits < 9) return -EINVAL; if ((last_fs_block > (sector_t)(~0ULL) >> (blocksize_bits - 9)) || (last_fs_page > (pgoff_t)(~0ULL))) { return -EFBIG; } return 0; } EXPORT_SYMBOL(generic_check_addressable); /* * No-op implementation of ->fsync for in-memory filesystems. */ int noop_fsync(struct file *file, loff_t start, loff_t end, int datasync) { return 0; } EXPORT_SYMBOL(noop_fsync); ssize_t noop_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { /* * iomap based filesystems support direct I/O without need for * this callback. However, it still needs to be set in * inode->a_ops so that open/fcntl know that direct I/O is * generally supported. */ return -EINVAL; } EXPORT_SYMBOL_GPL(noop_direct_IO); /* Because kfree isn't assignment-compatible with void(void*) ;-/ */ void kfree_link(void *p) { kfree(p); } EXPORT_SYMBOL(kfree_link); struct inode *alloc_anon_inode(struct super_block *s) { static const struct address_space_operations anon_aops = { .dirty_folio = noop_dirty_folio, }; struct inode *inode = new_inode_pseudo(s); if (!inode) return ERR_PTR(-ENOMEM); inode->i_ino = get_next_ino(); inode->i_mapping->a_ops = &anon_aops; /* * Mark the inode dirty from the very beginning, * that way it will never be moved to the dirty * list because mark_inode_dirty() will think * that it already _is_ on the dirty list. */ inode->i_state = I_DIRTY; /* * Historically anonymous inodes don't have a type at all and * userspace has come to rely on this. 
*/ inode->i_mode = S_IRUSR | S_IWUSR; inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); inode->i_flags |= S_PRIVATE | S_ANON_INODE; simple_inode_init_ts(inode); return inode; } EXPORT_SYMBOL(alloc_anon_inode); /** * simple_nosetlease - generic helper for prohibiting leases * @filp: file pointer * @arg: type of lease to obtain * @flp: new lease supplied for insertion * @priv: private data for lm_setup operation * * Generic helper for filesystems that do not wish to allow leases to be set. * All arguments are ignored and it just returns -EINVAL. */ int simple_nosetlease(struct file *filp, int arg, struct file_lease **flp, void **priv) { return -EINVAL; } EXPORT_SYMBOL(simple_nosetlease); /** * simple_get_link - generic helper to get the target of "fast" symlinks * @dentry: not used here * @inode: the symlink inode * @done: not used here * * Generic helper for filesystems to use for symlink inodes where a pointer to * the symlink target is stored in ->i_link. NOTE: this isn't normally called, * since as an optimization the path lookup code uses any non-NULL ->i_link * directly, without calling ->get_link(). But ->get_link() still must be set, * to mark the inode_operations as being for a symlink. * * Return: the symlink target */ const char *simple_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { return inode->i_link; } EXPORT_SYMBOL(simple_get_link); const struct inode_operations simple_symlink_inode_operations = { .get_link = simple_get_link, }; EXPORT_SYMBOL(simple_symlink_inode_operations); /* * Operations for a permanently empty directory. */ static struct dentry *empty_dir_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { return ERR_PTR(-ENOENT); } static int empty_dir_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { return -EPERM; } static ssize_t empty_dir_listxattr(struct dentry *dentry, char *list, size_t size) { return -EOPNOTSUPP; } static const struct inode_operations empty_dir_inode_operations = { .lookup = empty_dir_lookup, .setattr = empty_dir_setattr, .listxattr = empty_dir_listxattr, }; static loff_t empty_dir_llseek(struct file *file, loff_t offset, int whence) { /* An empty directory has two entries . and .. 
at offsets 0 and 1 */ return generic_file_llseek_size(file, offset, whence, 2, 2); } static int empty_dir_readdir(struct file *file, struct dir_context *ctx) { dir_emit_dots(file, ctx); return 0; } static const struct file_operations empty_dir_operations = { .llseek = empty_dir_llseek, .read = generic_read_dir, .iterate_shared = empty_dir_readdir, .fsync = noop_fsync, }; void make_empty_dir_inode(struct inode *inode) { set_nlink(inode, 2); inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; inode->i_uid = GLOBAL_ROOT_UID; inode->i_gid = GLOBAL_ROOT_GID; inode->i_rdev = 0; inode->i_size = 0; inode->i_blkbits = PAGE_SHIFT; inode->i_blocks = 0; inode->i_op = &empty_dir_inode_operations; inode->i_opflags &= ~IOP_XATTR; inode->i_fop = &empty_dir_operations; } bool is_empty_dir_inode(struct inode *inode) { return (inode->i_fop == &empty_dir_operations) && (inode->i_op == &empty_dir_inode_operations); } #if IS_ENABLED(CONFIG_UNICODE) /** * generic_ci_d_compare - generic d_compare implementation for casefolding filesystems * @dentry: dentry whose name we are checking against * @len: len of name of dentry * @str: str pointer to name of dentry * @name: Name to compare against * * Return: 0 if names match, 1 if mismatch, or -ERRNO */ int generic_ci_d_compare(const struct dentry *dentry, unsigned int len, const char *str, const struct qstr *name) { const struct dentry *parent; const struct inode *dir; union shortname_store strbuf; struct qstr qstr; /* * Attempt a case-sensitive match first. It is cheaper and * should cover most lookups, including all the sane * applications that expect a case-sensitive filesystem. * * This comparison is safe under RCU because the caller * guarantees the consistency between str and len. See * __d_lookup_rcu_op_compare() for details. */ if (len == name->len && !memcmp(str, name->name, len)) return 0; parent = READ_ONCE(dentry->d_parent); dir = READ_ONCE(parent->d_inode); if (!dir || !IS_CASEFOLDED(dir)) return 1; qstr.len = len; qstr.name = str; /* * If the dentry name is stored in-line, then it may be concurrently * modified by a rename. If this happens, the VFS will eventually retry * the lookup, so it doesn't matter what ->d_compare() returns. * However, it's unsafe to call utf8_strncasecmp() with an unstable * string. Therefore, we have to copy the name into a temporary buffer. * As above, len is guaranteed to match str, so the shortname case * is exactly when str points to ->d_shortname. 
*/ if (qstr.name == dentry->d_shortname.string) { strbuf = dentry->d_shortname; // NUL is guaranteed to be in there qstr.name = strbuf.string; /* prevent compiler from optimizing out the temporary buffer */ barrier(); } return utf8_strncasecmp(dentry->d_sb->s_encoding, name, &qstr); } EXPORT_SYMBOL(generic_ci_d_compare); /** * generic_ci_d_hash - generic d_hash implementation for casefolding filesystems * @dentry: dentry of the parent directory * @str: qstr of name whose hash we should fill in * * Return: 0 if hash was successful or unchanged, and -EINVAL on error */ int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str) { const struct inode *dir = READ_ONCE(dentry->d_inode); struct super_block *sb = dentry->d_sb; const struct unicode_map *um = sb->s_encoding; int ret; if (!dir || !IS_CASEFOLDED(dir)) return 0; ret = utf8_casefold_hash(um, dentry, str); if (ret < 0 && sb_has_strict_encoding(sb)) return -EINVAL; return 0; } EXPORT_SYMBOL(generic_ci_d_hash); static const struct dentry_operations generic_ci_dentry_ops = { .d_hash = generic_ci_d_hash, .d_compare = generic_ci_d_compare, #ifdef CONFIG_FS_ENCRYPTION .d_revalidate = fscrypt_d_revalidate, #endif }; /** * generic_ci_match() - Match a name (case-insensitively) with a dirent. * This is a filesystem helper for comparison with directory entries. * generic_ci_d_compare should be used in VFS' ->d_compare instead. * * @parent: Inode of the parent of the dirent under comparison * @name: name under lookup. * @folded_name: Optional pre-folded name under lookup * @de_name: Dirent name. * @de_name_len: dirent name length. * * Test whether a case-insensitive directory entry matches the filename * being searched. If @folded_name is provided, it is used instead of * recalculating the casefold of @name. * * Return: > 0 if the directory entry matches, 0 if it doesn't match, or * < 0 on error. */ int generic_ci_match(const struct inode *parent, const struct qstr *name, const struct qstr *folded_name, const u8 *de_name, u32 de_name_len) { const struct super_block *sb = parent->i_sb; const struct unicode_map *um = sb->s_encoding; struct fscrypt_str decrypted_name = FSTR_INIT(NULL, de_name_len); struct qstr dirent = QSTR_INIT(de_name, de_name_len); int res = 0; if (IS_ENCRYPTED(parent)) { const struct fscrypt_str encrypted_name = FSTR_INIT((u8 *) de_name, de_name_len); if (WARN_ON_ONCE(!fscrypt_has_encryption_key(parent))) return -EINVAL; decrypted_name.name = kmalloc(de_name_len, GFP_KERNEL); if (!decrypted_name.name) return -ENOMEM; res = fscrypt_fname_disk_to_usr(parent, 0, 0, &encrypted_name, &decrypted_name); if (res < 0) { kfree(decrypted_name.name); return res; } dirent.name = decrypted_name.name; dirent.len = decrypted_name.len; } /* * Attempt a case-sensitive match first. It is cheaper and * should cover most lookups, including all the sane * applications that expect a case-sensitive filesystem. 
*/ if (dirent.len == name->len && !memcmp(name->name, dirent.name, dirent.len)) goto out; if (folded_name->name) res = utf8_strncasecmp_folded(um, folded_name, &dirent); else res = utf8_strncasecmp(um, name, &dirent); out: kfree(decrypted_name.name); if (res < 0 && sb_has_strict_encoding(sb)) { pr_err_ratelimited("Directory contains filename that is invalid UTF-8"); return 0; } return !res; } EXPORT_SYMBOL(generic_ci_match); #endif #ifdef CONFIG_FS_ENCRYPTION static const struct dentry_operations generic_encrypted_dentry_ops = { .d_revalidate = fscrypt_d_revalidate, }; #endif /** * generic_set_sb_d_ops - helper for choosing the set of * filesystem-wide dentry operations for the enabled features * @sb: superblock to be configured * * Filesystems supporting casefolding and/or fscrypt can call this * helper at mount-time to configure default dentry_operations to the * best set of dentry operations required for the enabled features. * The helper must be called after these have been configured, but * before the root dentry is created. */ void generic_set_sb_d_ops(struct super_block *sb) { #if IS_ENABLED(CONFIG_UNICODE) if (sb->s_encoding) { set_default_d_op(sb, &generic_ci_dentry_ops); return; } #endif #ifdef CONFIG_FS_ENCRYPTION if (sb->s_cop) { set_default_d_op(sb, &generic_encrypted_dentry_ops); return; } #endif } EXPORT_SYMBOL(generic_set_sb_d_ops); /** * inode_maybe_inc_iversion - increments i_version * @inode: inode with the i_version that should be updated * @force: increment the counter even if it's not necessary? * * Every time the inode is modified, the i_version field must be seen to have * changed by any observer. * * If "force" is set or the QUERIED flag is set, then ensure that we increment * the value, and clear the queried flag. * * In the common case where neither is set, then we can return "false" without * updating i_version. * * If this function returns false, and no other metadata has changed, then we * can avoid logging the metadata. */ bool inode_maybe_inc_iversion(struct inode *inode, bool force) { u64 cur, new; /* * The i_version field is not strictly ordered with any other inode * information, but the legacy inode_inc_iversion code used a spinlock * to serialize increments. * * We add a full memory barrier to ensure that any de facto ordering * with other state is preserved (either implicitly coming from cmpxchg * or explicitly from smp_mb if we don't know upfront if we will execute * the former). * * These barriers pair with inode_query_iversion(). */ cur = inode_peek_iversion_raw(inode); if (!force && !(cur & I_VERSION_QUERIED)) { smp_mb(); cur = inode_peek_iversion_raw(inode); } do { /* If flag is clear then we needn't do anything */ if (!force && !(cur & I_VERSION_QUERIED)) return false; /* Since lowest bit is flag, add 2 to avoid it */ new = (cur & ~I_VERSION_QUERIED) + I_VERSION_INCREMENT; } while (!atomic64_try_cmpxchg(&inode->i_version, &cur, new)); return true; } EXPORT_SYMBOL(inode_maybe_inc_iversion); /** * inode_query_iversion - read i_version for later use * @inode: inode from which i_version should be read * * Read the inode i_version counter. This should be used by callers that wish * to store the returned i_version for later comparison. This will guarantee * that a later query of the i_version will result in a different value if * anything has changed. * * In this implementation, we fetch the current value, set the QUERIED flag and * then try to swap it into place with a cmpxchg, if it wasn't already set. 
If * that fails, we try again with the newly fetched value from the cmpxchg. */ u64 inode_query_iversion(struct inode *inode) { u64 cur, new; bool fenced = false; /* * Memory barriers (implicit in cmpxchg, explicit in smp_mb) pair with * inode_maybe_inc_iversion(), see that routine for more details. */ cur = inode_peek_iversion_raw(inode); do { /* If flag is already set, then no need to swap */ if (cur & I_VERSION_QUERIED) { if (!fenced) smp_mb(); break; } fenced = true; new = cur | I_VERSION_QUERIED; } while (!atomic64_try_cmpxchg(&inode->i_version, &cur, new)); return cur >> I_VERSION_QUERIED_SHIFT; } EXPORT_SYMBOL(inode_query_iversion); ssize_t direct_write_fallback(struct kiocb *iocb, struct iov_iter *iter, ssize_t direct_written, ssize_t buffered_written) { struct address_space *mapping = iocb->ki_filp->f_mapping; loff_t pos = iocb->ki_pos - buffered_written; loff_t end = iocb->ki_pos - 1; int err; /* * If the buffered write fallback returned an error, we want to return * the number of bytes which were written by direct I/O, or the error * code if that was zero. * * Note that this differs from normal direct-io semantics, which will * return -EFOO even if some bytes were written. */ if (unlikely(buffered_written < 0)) { if (direct_written) return direct_written; return buffered_written; } /* * We need to ensure that the page cache pages are written to disk and * invalidated to preserve the expected O_DIRECT semantics. */ err = filemap_write_and_wait_range(mapping, pos, end); if (err < 0) { /* * We don't know how much we wrote, so just return the number of * bytes which were direct-written */ iocb->ki_pos -= buffered_written; if (direct_written) return direct_written; return err; } invalidate_mapping_pages(mapping, pos >> PAGE_SHIFT, end >> PAGE_SHIFT); return direct_written + buffered_written; } EXPORT_SYMBOL_GPL(direct_write_fallback); /** * simple_inode_init_ts - initialize the timestamps for a new inode * @inode: inode to be initialized * * When a new inode is created, most filesystems set the timestamps to the * current time. Add a helper to do this. */ struct timespec64 simple_inode_init_ts(struct inode *inode) { struct timespec64 ts = inode_set_ctime_current(inode); inode_set_atime_to_ts(inode, ts); inode_set_mtime_to_ts(inode, ts); return ts; } EXPORT_SYMBOL(simple_inode_init_ts); struct dentry *stashed_dentry_get(struct dentry **stashed) { struct dentry *dentry; guard(rcu)(); dentry = rcu_dereference(*stashed); if (!dentry) return NULL; if (IS_ERR(dentry)) return dentry; if (!lockref_get_not_dead(&dentry->d_lockref)) return NULL; return dentry; } static struct dentry *prepare_anon_dentry(struct dentry **stashed, struct super_block *sb, void *data) { struct dentry *dentry; struct inode *inode; const struct stashed_operations *sops = sb->s_fs_info; int ret; inode = new_inode_pseudo(sb); if (!inode) { sops->put_data(data); return ERR_PTR(-ENOMEM); } inode->i_flags |= S_IMMUTABLE; inode->i_mode = S_IFREG; simple_inode_init_ts(inode); ret = sops->init_inode(inode, data); if (ret < 0) { iput(inode); return ERR_PTR(ret); } /* Notice when this is changed. */ WARN_ON_ONCE(!S_ISREG(inode->i_mode)); dentry = d_alloc_anon(sb); if (!dentry) { iput(inode); return ERR_PTR(-ENOMEM); } /* Store address of location where dentry's supposed to be stashed. 
*/ dentry->d_fsdata = stashed; /* @data is now owned by the fs */ d_instantiate(dentry, inode); return dentry; } struct dentry *stash_dentry(struct dentry **stashed, struct dentry *dentry) { guard(rcu)(); for (;;) { struct dentry *old; /* Assume any old dentry was cleared out. */ old = cmpxchg(stashed, NULL, dentry); if (likely(!old)) return dentry; /* Check if somebody else installed a reusable dentry. */ if (lockref_get_not_dead(&old->d_lockref)) return old; /* There's an old dead dentry there, try to take it over. */ if (likely(try_cmpxchg(stashed, &old, dentry))) return dentry; } } /** * path_from_stashed - create path from stashed or new dentry * @stashed: where to retrieve or stash dentry * @mnt: mnt of the filesystems to use * @data: data to store in inode->i_private * @path: path to create * * The function tries to retrieve a stashed dentry from @stashed. If the dentry * is still valid then it will be reused. If the dentry isn't able the function * will allocate a new dentry and inode. It will then check again whether it * can reuse an existing dentry in case one has been added in the meantime or * update @stashed with the newly added dentry. * * Special-purpose helper for nsfs and pidfs. * * Return: On success zero and on failure a negative error is returned. */ int path_from_stashed(struct dentry **stashed, struct vfsmount *mnt, void *data, struct path *path) { struct dentry *dentry, *res; const struct stashed_operations *sops = mnt->mnt_sb->s_fs_info; /* See if dentry can be reused. */ res = stashed_dentry_get(stashed); if (IS_ERR(res)) return PTR_ERR(res); if (res) { sops->put_data(data); goto make_path; } /* Allocate a new dentry. */ dentry = prepare_anon_dentry(stashed, mnt->mnt_sb, data); if (IS_ERR(dentry)) return PTR_ERR(dentry); /* Added a new dentry. @data is now owned by the filesystem. */ if (sops->stash_dentry) res = sops->stash_dentry(stashed, dentry); else res = stash_dentry(stashed, dentry); if (IS_ERR(res)) { dput(dentry); return PTR_ERR(res); } if (res != dentry) dput(dentry); make_path: path->dentry = res; path->mnt = mntget(mnt); VFS_WARN_ON_ONCE(path->dentry->d_fsdata != stashed); VFS_WARN_ON_ONCE(d_inode(path->dentry)->i_private != data); return 0; } void stashed_dentry_prune(struct dentry *dentry) { struct dentry **stashed = dentry->d_fsdata; struct inode *inode = d_inode(dentry); if (WARN_ON_ONCE(!stashed)) return; if (!inode) return; /* * Only replace our own @dentry as someone else might've * already cleared out @dentry and stashed their own * dentry in there. */ cmpxchg(stashed, dentry, NULL); } /* parent must be held exclusive */ struct dentry *simple_start_creating(struct dentry *parent, const char *name) { struct dentry *dentry; struct inode *dir = d_inode(parent); inode_lock(dir); if (unlikely(IS_DEADDIR(dir))) { inode_unlock(dir); return ERR_PTR(-ENOENT); } dentry = lookup_noperm(&QSTR(name), parent); if (IS_ERR(dentry)) { inode_unlock(dir); return dentry; } if (dentry->d_inode) { dput(dentry); inode_unlock(dir); return ERR_PTR(-EEXIST); } return dentry; } EXPORT_SYMBOL(simple_start_creating);
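
The libfs helpers listed above are meant to be composed by small in-memory filesystems rather than used in isolation. As a rough illustration of how they fit together, here is a hypothetical, minimal pseudo-filesystem module sketch: the name "examplefs", the magic value, and the "hello" file are invented for illustration, and only helpers that appear in the listing above (simple_fill_super(), simple_open(), simple_read_from_buffer(), get_tree_nodev()) plus standard VFS registration calls are relied on.

/*
 * Hypothetical example (not part of libfs.c): a tiny read-only pseudo
 * filesystem built almost entirely from the libfs helpers above.
 */
#include <linux/fs.h>
#include <linux/fs_context.h>
#include <linux/module.h>

#define EXAMPLEFS_MAGIC 0x45584d50      /* arbitrary value for this sketch */

static ssize_t examplefs_hello_read(struct file *file, char __user *buf,
                                    size_t len, loff_t *ppos)
{
        static const char msg[] = "hello from examplefs\n";

        /* simple_read_from_buffer() handles *ppos and short reads */
        return simple_read_from_buffer(buf, len, ppos, msg, sizeof(msg) - 1);
}

static const struct file_operations examplefs_hello_fops = {
        .owner  = THIS_MODULE,
        .open   = simple_open,
        .read   = examplefs_hello_read,
        .llseek = default_llseek,
};

static int examplefs_fill_super(struct super_block *sb, struct fs_context *fc)
{
        /* index 1 must stay empty: the root inode gets inode number 1 */
        static const struct tree_descr examplefs_files[] = {
                [2] = { "hello", &examplefs_hello_fops, 0444 },
                { "" }  /* terminator */
        };

        return simple_fill_super(sb, EXAMPLEFS_MAGIC, examplefs_files);
}

static int examplefs_get_tree(struct fs_context *fc)
{
        return get_tree_nodev(fc, examplefs_fill_super);
}

static const struct fs_context_operations examplefs_context_ops = {
        .get_tree = examplefs_get_tree,
};

static int examplefs_init_fs_context(struct fs_context *fc)
{
        fc->ops = &examplefs_context_ops;
        return 0;
}

static struct file_system_type examplefs_type = {
        .owner           = THIS_MODULE,
        .name            = "examplefs",
        .init_fs_context = examplefs_init_fs_context,
        .kill_sb         = kill_litter_super,
};

static int __init examplefs_init(void)
{
        return register_filesystem(&examplefs_type);
}

static void __exit examplefs_exit(void)
{
        unregister_filesystem(&examplefs_type);
}

module_init(examplefs_init);
module_exit(examplefs_exit);
MODULE_DESCRIPTION("Illustrative libfs-based pseudo filesystem (sketch)");
MODULE_LICENSE("GPL");

Once such a module is loaded, the filesystem could be mounted with something like "mount -t examplefs none /mnt"; simple_fill_super() creates the root directory and the "hello" entry, and the dcache_dir_* and generic helpers above provide directory iteration for free.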
// SPDX-License-Identifier: GPL-2.0-only /* Tom Kelly's Scalable TCP * * See http://www.deneholme.net/tom/scalable/ * * John Heffner <jheffner@sc.edu> */ #include <linux/module.h> #include <net/tcp.h> /* These factors derived from the recommended values in the paper: * .01 and 7/8. */ #define TCP_SCALABLE_AI_CNT 100U #define TCP_SCALABLE_MD_SCALE 3 static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked) { struct tcp_sock *tp = tcp_sk(sk); if (!tcp_is_cwnd_limited(sk)) return; if (tcp_in_slow_start(tp)) { acked = tcp_slow_start(tp, acked); if (!acked) return; } tcp_cong_avoid_ai(tp, min(tcp_snd_cwnd(tp), TCP_SCALABLE_AI_CNT), acked); } static u32 tcp_scalable_ssthresh(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); return max(tcp_snd_cwnd(tp) - (tcp_snd_cwnd(tp)>>TCP_SCALABLE_MD_SCALE), 2U); } static struct tcp_congestion_ops tcp_scalable __read_mostly = { .ssthresh = tcp_scalable_ssthresh, .undo_cwnd = tcp_reno_undo_cwnd, .cong_avoid = tcp_scalable_cong_avoid, .owner = THIS_MODULE, .name = "scalable", }; static int __init tcp_scalable_register(void) { return tcp_register_congestion_control(&tcp_scalable); } static void __exit tcp_scalable_unregister(void) { tcp_unregister_congestion_control(&tcp_scalable); } module_init(tcp_scalable_register); module_exit(tcp_scalable_unregister); MODULE_AUTHOR("John Heffner"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Scalable TCP");
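Illustrative only and not part of the module above: once "scalable" is registered, a userspace program can opt a single socket into it through the standard TCP_CONGESTION socket option, assuming the module is loaded and the algorithm is listed in net.ipv4.tcp_allowed_congestion_control.

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>

/* Returns a TCP socket using the "scalable" congestion control, or -1. */
int open_scalable_socket(void)
{
	const char name[] = "scalable";
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	if (fd < 0)
		return -1;

	/* Per-socket congestion control selection. */
	if (setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION,
		       name, strlen(name)) < 0) {
		perror("setsockopt(TCP_CONGESTION)");
		close(fd);
		return -1;
	}

	return fd;
}

For reference, TCP_SCALABLE_AI_CNT of 100 caps the additive-increase divisor so cwnd grows by roughly one segment per 100 ACKs (the 0.01 factor), and TCP_SCALABLE_MD_SCALE of 3 yields the 7/8 multiplicative decrease via cwnd - (cwnd >> 3).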
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_CGROUP_H #define _LINUX_CGROUP_H /* * cgroup interface * * Copyright (C) 2003 BULL SA * Copyright (C) 2004-2006 Silicon Graphics, Inc. 
* */ #include <linux/sched.h> #include <linux/nodemask.h> #include <linux/list.h> #include <linux/rculist.h> #include <linux/cgroupstats.h> #include <linux/fs.h> #include <linux/seq_file.h> #include <linux/kernfs.h> #include <linux/jump_label.h> #include <linux/types.h> #include <linux/notifier.h> #include <linux/ns_common.h> #include <linux/nsproxy.h> #include <linux/user_namespace.h> #include <linux/refcount.h> #include <linux/kernel_stat.h> #include <linux/cgroup-defs.h> #include <linux/cgroup_namespace.h> struct kernel_clone_args; /* * All weight knobs on the default hierarchy should use the following min, * default and max values. The default value is the logarithmic center of * MIN and MAX and allows 100x to be expressed in both directions. */ #define CGROUP_WEIGHT_MIN 1 #define CGROUP_WEIGHT_DFL 100 #define CGROUP_WEIGHT_MAX 10000 #ifdef CONFIG_CGROUPS enum css_task_iter_flags { CSS_TASK_ITER_PROCS = (1U << 0), /* walk only threadgroup leaders */ CSS_TASK_ITER_THREADED = (1U << 1), /* walk all threaded css_sets in the domain */ CSS_TASK_ITER_SKIPPED = (1U << 16), /* internal flags */ }; /* a css_task_iter should be treated as an opaque object */ struct css_task_iter { struct cgroup_subsys *ss; unsigned int flags; struct list_head *cset_pos; struct list_head *cset_head; struct list_head *tcset_pos; struct list_head *tcset_head; struct list_head *task_pos; struct list_head *cur_tasks_head; struct css_set *cur_cset; struct css_set *cur_dcset; struct task_struct *cur_task; struct list_head iters_node; /* css_set->task_iters */ }; enum cgroup_lifetime_events { CGROUP_LIFETIME_ONLINE, CGROUP_LIFETIME_OFFLINE, }; extern struct file_system_type cgroup_fs_type; extern struct cgroup_root cgrp_dfl_root; extern struct css_set init_css_set; extern spinlock_t css_set_lock; extern struct blocking_notifier_head cgroup_lifetime_notifier; #define SUBSYS(_x) extern struct cgroup_subsys _x ## _cgrp_subsys; #include <linux/cgroup_subsys.h> #undef SUBSYS #define SUBSYS(_x) \ extern struct static_key_true _x ## _cgrp_subsys_enabled_key; \ extern struct static_key_true _x ## _cgrp_subsys_on_dfl_key; #include <linux/cgroup_subsys.h> #undef SUBSYS /** * cgroup_subsys_enabled - fast test on whether a subsys is enabled * @ss: subsystem in question */ #define cgroup_subsys_enabled(ss) \ static_branch_likely(&ss ## _enabled_key) /** * cgroup_subsys_on_dfl - fast test on whether a subsys is on default hierarchy * @ss: subsystem in question */ #define cgroup_subsys_on_dfl(ss) \ static_branch_likely(&ss ## _on_dfl_key) bool css_has_online_children(struct cgroup_subsys_state *css); struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss); struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgroup, struct cgroup_subsys *ss); struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgroup, struct cgroup_subsys *ss); struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, struct cgroup_subsys *ss); struct cgroup *cgroup_get_from_path(const char *path); struct cgroup *cgroup_get_from_fd(int fd); struct cgroup *cgroup_v1v2_get_from_fd(int fd); int cgroup_attach_task_all(struct task_struct *from, struct task_struct *); int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from); int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); int cgroup_rm_cftypes(struct cftype *cfts); void 
cgroup_file_notify(struct cgroup_file *cfile); void cgroup_file_show(struct cgroup_file *cfile, bool show); int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry); int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *tsk); void cgroup_fork(struct task_struct *p); extern int cgroup_can_fork(struct task_struct *p, struct kernel_clone_args *kargs); extern void cgroup_cancel_fork(struct task_struct *p, struct kernel_clone_args *kargs); extern void cgroup_post_fork(struct task_struct *p, struct kernel_clone_args *kargs); void cgroup_exit(struct task_struct *p); void cgroup_release(struct task_struct *p); void cgroup_free(struct task_struct *p); int cgroup_init_early(void); int cgroup_init(void); int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v); /* * Iteration helpers and macros. */ struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos, struct cgroup_subsys_state *parent); struct cgroup_subsys_state *css_next_descendant_pre(struct cgroup_subsys_state *pos, struct cgroup_subsys_state *css); struct cgroup_subsys_state *css_rightmost_descendant(struct cgroup_subsys_state *pos); struct cgroup_subsys_state *css_next_descendant_post(struct cgroup_subsys_state *pos, struct cgroup_subsys_state *css); struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset, struct cgroup_subsys_state **dst_cssp); struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset, struct cgroup_subsys_state **dst_cssp); void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags, struct css_task_iter *it); struct task_struct *css_task_iter_next(struct css_task_iter *it); void css_task_iter_end(struct css_task_iter *it); /** * css_for_each_child - iterate through children of a css * @pos: the css * to use as the loop cursor * @parent: css whose children to walk * * Walk @parent's children. Must be called under rcu_read_lock(). * * If a subsystem synchronizes ->css_online() and the start of iteration, a * css which finished ->css_online() is guaranteed to be visible in the * future iterations and will stay visible until the last reference is put. * A css which hasn't finished ->css_online() or already finished * ->css_offline() may show up during traversal. It's each subsystem's * responsibility to synchronize against on/offlining. * * It is allowed to temporarily drop RCU read lock during iteration. The * caller is responsible for ensuring that @pos remains accessible until * the start of the next iteration by, for example, bumping the css refcnt. */ #define css_for_each_child(pos, parent) \ for ((pos) = css_next_child(NULL, (parent)); (pos); \ (pos) = css_next_child((pos), (parent))) /** * css_for_each_descendant_pre - pre-order walk of a css's descendants * @pos: the css * to use as the loop cursor * @root: css whose descendants to walk * * Walk @root's descendants. @root is included in the iteration and the * first node to be visited. Must be called under rcu_read_lock(). * * If a subsystem synchronizes ->css_online() and the start of iteration, a * css which finished ->css_online() is guaranteed to be visible in the * future iterations and will stay visible until the last reference is put. * A css which hasn't finished ->css_online() or already finished * ->css_offline() may show up during traversal. It's each subsystem's * responsibility to synchronize against on/offlining. * * For example, the following guarantees that a descendant can't escape * state updates of its ancestors. 
* * my_online(@css) * { * Lock @css's parent and @css; * Inherit state from the parent; * Unlock both. * } * * my_update_state(@css) * { * css_for_each_descendant_pre(@pos, @css) { * Lock @pos; * if (@pos == @css) * Update @css's state; * else * Verify @pos is alive and inherit state from its parent; * Unlock @pos; * } * } * * As long as the inheriting step, including checking the parent state, is * enclosed inside @pos locking, double-locking the parent isn't necessary * while inheriting. The state update to the parent is guaranteed to be * visible by walking order and, as long as inheriting operations to the * same @pos are atomic to each other, multiple updates racing each other * still result in the correct state. It's guaranteed that at least one * inheritance happens for any css after the latest update to its parent. * * If checking parent's state requires locking the parent, each inheriting * iteration should lock and unlock both @pos->parent and @pos. * * Alternatively, a subsystem may choose to use a single global lock to * synchronize ->css_online() and ->css_offline() against tree-walking * operations. * * It is allowed to temporarily drop RCU read lock during iteration. The * caller is responsible for ensuring that @pos remains accessible until * the start of the next iteration by, for example, bumping the css refcnt. */ #define css_for_each_descendant_pre(pos, css) \ for ((pos) = css_next_descendant_pre(NULL, (css)); (pos); \ (pos) = css_next_descendant_pre((pos), (css))) /** * css_for_each_descendant_post - post-order walk of a css's descendants * @pos: the css * to use as the loop cursor * @css: css whose descendants to walk * * Similar to css_for_each_descendant_pre() but performs post-order * traversal instead. @root is included in the iteration and the last * node to be visited. * * If a subsystem synchronizes ->css_online() and the start of iteration, a * css which finished ->css_online() is guaranteed to be visible in the * future iterations and will stay visible until the last reference is put. * A css which hasn't finished ->css_online() or already finished * ->css_offline() may show up during traversal. It's each subsystem's * responsibility to synchronize against on/offlining. * * Note that the walk visibility guarantee example described in pre-order * walk doesn't apply the same to post-order walks. */ #define css_for_each_descendant_post(pos, css) \ for ((pos) = css_next_descendant_post(NULL, (css)); (pos); \ (pos) = css_next_descendant_post((pos), (css))) /** * cgroup_taskset_for_each - iterate cgroup_taskset * @task: the loop cursor * @dst_css: the destination css * @tset: taskset to iterate * * @tset may contain multiple tasks and they may belong to multiple * processes. * * On the v2 hierarchy, there may be tasks from multiple processes and they * may not share the source or destination csses. * * On traditional hierarchies, when there are multiple tasks in @tset, if a * task of a process is in @tset, all tasks of the process are in @tset. * Also, all are guaranteed to share the same source and destination csses. * * Iteration is not in any specific order. */ #define cgroup_taskset_for_each(task, dst_css, tset) \ for ((task) = cgroup_taskset_first((tset), &(dst_css)); \ (task); \ (task) = cgroup_taskset_next((tset), &(dst_css))) /** * cgroup_taskset_for_each_leader - iterate group leaders in a cgroup_taskset * @leader: the loop cursor * @dst_css: the destination css * @tset: taskset to iterate * * Iterate threadgroup leaders of @tset. 
For single-task migrations, @tset * may not contain any. */ #define cgroup_taskset_for_each_leader(leader, dst_css, tset) \ for ((leader) = cgroup_taskset_first((tset), &(dst_css)); \ (leader); \ (leader) = cgroup_taskset_next((tset), &(dst_css))) \ if ((leader) != (leader)->group_leader) \ ; \ else /* * Inline functions. */ #ifdef CONFIG_DEBUG_CGROUP_REF void css_get(struct cgroup_subsys_state *css); void css_get_many(struct cgroup_subsys_state *css, unsigned int n); bool css_tryget(struct cgroup_subsys_state *css); bool css_tryget_online(struct cgroup_subsys_state *css); void css_put(struct cgroup_subsys_state *css); void css_put_many(struct cgroup_subsys_state *css, unsigned int n); #else #define CGROUP_REF_FN_ATTRS static inline #define CGROUP_REF_EXPORT(fn) #include <linux/cgroup_refcnt.h> #endif static inline u64 cgroup_id(const struct cgroup *cgrp) { return cgrp->kn->id; } /** * css_is_dying - test whether the specified css is dying * @css: target css * * Test whether @css is in the process of offlining or already offline. In * most cases, ->css_online() and ->css_offline() callbacks should be * enough; however, the actual offline operations are RCU delayed and this * test returns %true also when @css is scheduled to be offlined. * * This is useful, for example, when the use case requires synchronous * behavior with respect to cgroup removal. cgroup removal schedules css * offlining but the css can seem alive while the operation is being * delayed. If the delay affects user visible semantics, this test can be * used to resolve the situation. */ static inline bool css_is_dying(struct cgroup_subsys_state *css) { return css->flags & CSS_DYING; } static inline bool css_is_online(struct cgroup_subsys_state *css) { return css->flags & CSS_ONLINE; } static inline bool css_is_self(struct cgroup_subsys_state *css) { if (css == &css->cgroup->self) { /* cgroup::self should not have subsystem association */ WARN_ON(css->ss != NULL); return true; } return false; } static inline void cgroup_get(struct cgroup *cgrp) { css_get(&cgrp->self); } static inline bool cgroup_tryget(struct cgroup *cgrp) { return css_tryget(&cgrp->self); } static inline void cgroup_put(struct cgroup *cgrp) { css_put(&cgrp->self); } extern struct mutex cgroup_mutex; static inline void cgroup_lock(void) { mutex_lock(&cgroup_mutex); } static inline void cgroup_unlock(void) { mutex_unlock(&cgroup_mutex); } /** * task_css_set_check - obtain a task's css_set with extra access conditions * @task: the task to obtain css_set for * @__c: extra condition expression to be passed to rcu_dereference_check() * * A task's css_set is RCU protected, initialized and exited while holding * task_lock(), and can only be modified while holding both cgroup_mutex * and task_lock() while the task is alive. This macro verifies that the * caller is inside proper critical section and returns @task's css_set. * * The caller can also specify additional allowed conditions via @__c, such * as locks used during the cgroup_subsys::attach() methods. 
*/ #ifdef CONFIG_PROVE_RCU #define task_css_set_check(task, __c) \ rcu_dereference_check((task)->cgroups, \ rcu_read_lock_sched_held() || \ lockdep_is_held(&cgroup_mutex) || \ lockdep_is_held(&css_set_lock) || \ ((task)->flags & PF_EXITING) || (__c)) #else #define task_css_set_check(task, __c) \ rcu_dereference((task)->cgroups) #endif /** * task_css_check - obtain css for (task, subsys) w/ extra access conds * @task: the target task * @subsys_id: the target subsystem ID * @__c: extra condition expression to be passed to rcu_dereference_check() * * Return the cgroup_subsys_state for the (@task, @subsys_id) pair. The * synchronization rules are the same as task_css_set_check(). */ #define task_css_check(task, subsys_id, __c) \ task_css_set_check((task), (__c))->subsys[(subsys_id)] /** * task_css_set - obtain a task's css_set * @task: the task to obtain css_set for * * See task_css_set_check(). */ static inline struct css_set *task_css_set(struct task_struct *task) { return task_css_set_check(task, false); } /** * task_css - obtain css for (task, subsys) * @task: the target task * @subsys_id: the target subsystem ID * * See task_css_check(). */ static inline struct cgroup_subsys_state *task_css(struct task_struct *task, int subsys_id) { return task_css_check(task, subsys_id, false); } /** * task_get_css - find and get the css for (task, subsys) * @task: the target task * @subsys_id: the target subsystem ID * * Find the css for the (@task, @subsys_id) combination, increment a * reference on and return it. This function is guaranteed to return a * valid css. The returned css may already have been offlined. */ static inline struct cgroup_subsys_state * task_get_css(struct task_struct *task, int subsys_id) { struct cgroup_subsys_state *css; rcu_read_lock(); while (true) { css = task_css(task, subsys_id); /* * Can't use css_tryget_online() here. A task which has * PF_EXITING set may stay associated with an offline css. * If such task calls this function, css_tryget_online() * will keep failing. */ if (likely(css_tryget(css))) break; cpu_relax(); } rcu_read_unlock(); return css; } /** * task_css_is_root - test whether a task belongs to the root css * @task: the target task * @subsys_id: the target subsystem ID * * Test whether @task belongs to the root css on the specified subsystem. * May be invoked in any context. */ static inline bool task_css_is_root(struct task_struct *task, int subsys_id) { return task_css_check(task, subsys_id, true) == init_css_set.subsys[subsys_id]; } static inline struct cgroup *task_cgroup(struct task_struct *task, int subsys_id) { return task_css(task, subsys_id)->cgroup; } static inline struct cgroup *task_dfl_cgroup(struct task_struct *task) { return task_css_set(task)->dfl_cgrp; } static inline struct cgroup *cgroup_parent(struct cgroup *cgrp) { struct cgroup_subsys_state *parent_css = cgrp->self.parent; if (parent_css) return container_of(parent_css, struct cgroup, self); return NULL; } /** * cgroup_is_descendant - test ancestry * @cgrp: the cgroup to be tested * @ancestor: possible ancestor of @cgrp * * Test whether @cgrp is a descendant of @ancestor. It also returns %true * if @cgrp == @ancestor. This function is safe to call as long as @cgrp * and @ancestor are accessible. 
*/ static inline bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor) { if (cgrp->root != ancestor->root || cgrp->level < ancestor->level) return false; return cgrp->ancestors[ancestor->level] == ancestor; } /** * cgroup_ancestor - find ancestor of cgroup * @cgrp: cgroup to find ancestor of * @ancestor_level: level of ancestor to find starting from root * * Find ancestor of cgroup at specified level starting from root if it exists * and return pointer to it. Return NULL if @cgrp doesn't have ancestor at * @ancestor_level. * * This function is safe to call as long as @cgrp is accessible. */ static inline struct cgroup *cgroup_ancestor(struct cgroup *cgrp, int ancestor_level) { if (ancestor_level < 0 || ancestor_level > cgrp->level) return NULL; return cgrp->ancestors[ancestor_level]; } /** * task_under_cgroup_hierarchy - test task's membership of cgroup ancestry * @task: the task to be tested * @ancestor: possible ancestor of @task's cgroup * * Tests whether @task's default cgroup hierarchy is a descendant of @ancestor. * It follows all the same rules as cgroup_is_descendant, and only applies * to the default hierarchy. */ static inline bool task_under_cgroup_hierarchy(struct task_struct *task, struct cgroup *ancestor) { struct css_set *cset = task_css_set(task); return cgroup_is_descendant(cset->dfl_cgrp, ancestor); } /* no synchronization, the result can only be used as a hint */ static inline bool cgroup_is_populated(struct cgroup *cgrp) { return cgrp->nr_populated_csets + cgrp->nr_populated_domain_children + cgrp->nr_populated_threaded_children; } /* returns ino associated with a cgroup */ static inline ino_t cgroup_ino(struct cgroup *cgrp) { return kernfs_ino(cgrp->kn); } /* cft/css accessors for cftype->write() operation */ static inline struct cftype *of_cft(struct kernfs_open_file *of) { return of->kn->priv; } struct cgroup_subsys_state *of_css(struct kernfs_open_file *of); /* cft/css accessors for cftype->seq_*() operations */ static inline struct cftype *seq_cft(struct seq_file *seq) { return of_cft(seq->private); } static inline struct cgroup_subsys_state *seq_css(struct seq_file *seq) { return of_css(seq->private); } /* * Name / path handling functions. All are thin wrappers around the kernfs * counterparts and can be called under any context. */ static inline int cgroup_name(struct cgroup *cgrp, char *buf, size_t buflen) { return kernfs_name(cgrp->kn, buf, buflen); } static inline int cgroup_path(struct cgroup *cgrp, char *buf, size_t buflen) { return kernfs_path(cgrp->kn, buf, buflen); } static inline void pr_cont_cgroup_name(struct cgroup *cgrp) { pr_cont_kernfs_name(cgrp->kn); } static inline void pr_cont_cgroup_path(struct cgroup *cgrp) { pr_cont_kernfs_path(cgrp->kn); } bool cgroup_psi_enabled(void); static inline void cgroup_init_kthreadd(void) { /* * kthreadd is inherited by all kthreads, keep it in the root so * that the new kthreads are guaranteed to stay in the root until * initialization is finished. */ current->no_cgroup_migration = 1; } static inline void cgroup_kthread_ready(void) { /* * This kthread finished initialization. The creator should have * set PF_NO_SETAFFINITY if this kthread should stay in the root. 
*/ current->no_cgroup_migration = 0; } void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen); struct cgroup *__cgroup_get_from_id(u64 id); struct cgroup *cgroup_get_from_id(u64 id); #else /* !CONFIG_CGROUPS */ struct cgroup_subsys_state; struct cgroup; static inline u64 cgroup_id(const struct cgroup *cgrp) { return 1; } static inline void css_get(struct cgroup_subsys_state *css) {} static inline void css_put(struct cgroup_subsys_state *css) {} static inline void cgroup_lock(void) {} static inline void cgroup_unlock(void) {} static inline int cgroup_attach_task_all(struct task_struct *from, struct task_struct *t) { return 0; } static inline int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) { return -EINVAL; } static inline void cgroup_fork(struct task_struct *p) {} static inline int cgroup_can_fork(struct task_struct *p, struct kernel_clone_args *kargs) { return 0; } static inline void cgroup_cancel_fork(struct task_struct *p, struct kernel_clone_args *kargs) {} static inline void cgroup_post_fork(struct task_struct *p, struct kernel_clone_args *kargs) {} static inline void cgroup_exit(struct task_struct *p) {} static inline void cgroup_release(struct task_struct *p) {} static inline void cgroup_free(struct task_struct *p) {} static inline int cgroup_init_early(void) { return 0; } static inline int cgroup_init(void) { return 0; } static inline void cgroup_init_kthreadd(void) {} static inline void cgroup_kthread_ready(void) {} static inline struct cgroup *cgroup_parent(struct cgroup *cgrp) { return NULL; } static inline bool cgroup_psi_enabled(void) { return false; } static inline bool task_under_cgroup_hierarchy(struct task_struct *task, struct cgroup *ancestor) { return true; } static inline void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen) {} #endif /* !CONFIG_CGROUPS */ #ifdef CONFIG_CGROUPS /* * cgroup scalable recursive statistics. */ void css_rstat_updated(struct cgroup_subsys_state *css, int cpu); void css_rstat_flush(struct cgroup_subsys_state *css); /* * Basic resource stats. */ #ifdef CONFIG_CGROUP_CPUACCT void cpuacct_charge(struct task_struct *tsk, u64 cputime); void cpuacct_account_field(struct task_struct *tsk, int index, u64 val); #else static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} static inline void cpuacct_account_field(struct task_struct *tsk, int index, u64 val) {} #endif void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec); void __cgroup_account_cputime_field(struct cgroup *cgrp, enum cpu_usage_stat index, u64 delta_exec); static inline void cgroup_account_cputime(struct task_struct *task, u64 delta_exec) { struct cgroup *cgrp; cpuacct_charge(task, delta_exec); cgrp = task_dfl_cgroup(task); if (cgroup_parent(cgrp)) __cgroup_account_cputime(cgrp, delta_exec); } static inline void cgroup_account_cputime_field(struct task_struct *task, enum cpu_usage_stat index, u64 delta_exec) { struct cgroup *cgrp; cpuacct_account_field(task, index, delta_exec); cgrp = task_dfl_cgroup(task); if (cgroup_parent(cgrp)) __cgroup_account_cputime_field(cgrp, index, delta_exec); } #else /* CONFIG_CGROUPS */ static inline void cgroup_account_cputime(struct task_struct *task, u64 delta_exec) {} static inline void cgroup_account_cputime_field(struct task_struct *task, enum cpu_usage_stat index, u64 delta_exec) {} #endif /* CONFIG_CGROUPS */ /* * sock->sk_cgrp_data handling. For more info, see sock_cgroup_data * definition in cgroup-defs.h. 
*/ #ifdef CONFIG_SOCK_CGROUP_DATA void cgroup_sk_alloc(struct sock_cgroup_data *skcd); void cgroup_sk_clone(struct sock_cgroup_data *skcd); void cgroup_sk_free(struct sock_cgroup_data *skcd); static inline struct cgroup *sock_cgroup_ptr(struct sock_cgroup_data *skcd) { return skcd->cgroup; } #else /* CONFIG_CGROUP_DATA */ static inline void cgroup_sk_alloc(struct sock_cgroup_data *skcd) {} static inline void cgroup_sk_clone(struct sock_cgroup_data *skcd) {} static inline void cgroup_sk_free(struct sock_cgroup_data *skcd) {} #endif /* CONFIG_CGROUP_DATA */ #ifdef CONFIG_CGROUPS void cgroup_enter_frozen(void); void cgroup_leave_frozen(bool always_leave); void cgroup_update_frozen(struct cgroup *cgrp); void cgroup_freeze(struct cgroup *cgrp, bool freeze); void cgroup_freezer_migrate_task(struct task_struct *task, struct cgroup *src, struct cgroup *dst); static inline bool cgroup_task_frozen(struct task_struct *task) { return task->frozen; } #else /* !CONFIG_CGROUPS */ static inline void cgroup_enter_frozen(void) { } static inline void cgroup_leave_frozen(bool always_leave) { } static inline bool cgroup_task_frozen(struct task_struct *task) { return false; } #endif /* !CONFIG_CGROUPS */ #ifdef CONFIG_CGROUP_BPF static inline void cgroup_bpf_get(struct cgroup *cgrp) { percpu_ref_get(&cgrp->bpf.refcnt); } static inline void cgroup_bpf_put(struct cgroup *cgrp) { percpu_ref_put(&cgrp->bpf.refcnt); } #else /* CONFIG_CGROUP_BPF */ static inline void cgroup_bpf_get(struct cgroup *cgrp) {} static inline void cgroup_bpf_put(struct cgroup *cgrp) {} #endif /* CONFIG_CGROUP_BPF */ struct cgroup *task_get_cgroup1(struct task_struct *tsk, int hierarchy_id); struct cgroup_of_peak *of_peak(struct kernfs_open_file *of); #endif /* _LINUX_CGROUP_H */
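An illustrative sketch, not part of the header above: counting the processes attached to a css with the task iterator declared here; the function name is invented for the example.

#include <linux/cgroup.h>

static unsigned int example_count_procs(struct cgroup_subsys_state *css)
{
	struct css_task_iter it;
	unsigned int nr = 0;

	/* CSS_TASK_ITER_PROCS visits only threadgroup leaders. */
	css_task_iter_start(css, CSS_TASK_ITER_PROCS, &it);
	while (css_task_iter_next(&it))
		nr++;
	css_task_iter_end(&it);

	return nr;
}

css_task_iter_next() returns each matching task_struct in turn and NULL when the walk is done.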
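A second illustrative sketch against the same header: testing whether a task currently sits below a given cgroup on the default hierarchy, which mirrors what task_under_cgroup_hierarchy() does; the function name is invented for the example.

#include <linux/cgroup.h>
#include <linux/rcupdate.h>

static bool example_task_below(struct task_struct *task,
			       struct cgroup *ancestor)
{
	bool ret;

	/* task_dfl_cgroup() dereferences the task's RCU-protected css_set. */
	rcu_read_lock();
	ret = cgroup_is_descendant(task_dfl_cgroup(task), ancestor);
	rcu_read_unlock();

	return ret;
}

cgroup_is_descendant() is a constant-time check against the ->ancestors[] array, so it is cheap and safe to call as long as both cgroups remain accessible.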
// SPDX-License-Identifier: GPL-2.0-only /* * scsi_error.c Copyright (C) 1997 Eric Youngdale * * SCSI error/timeout handling * Initial versions: Eric Youngdale. Based upon conversations with * Leonard Zubkoff and David Miller at Linux Expo, * ideas originating from all over the place. * * Restructured scsi_unjam_host and associated functions. * September 04, 2002 Mike Anderson (andmike@us.ibm.com) * * Forward port of Russell King's (rmk@arm.linux.org.uk) changes and * minor cleanups. * September 30, 2002 Mike Anderson (andmike@us.ibm.com) */ #include <linux/module.h> #include <linux/sched.h> #include <linux/gfp.h> #include <linux/timer.h> #include <linux/string.h> #include <linux/kernel.h> #include <linux/freezer.h> #include <linux/kthread.h> #include <linux/interrupt.h> #include <linux/blkdev.h> #include <linux/delay.h> #include <linux/jiffies.h> #include <scsi/scsi.h> #include <scsi/scsi_cmnd.h> #include <scsi/scsi_dbg.h> #include <scsi/scsi_device.h> #include <scsi/scsi_driver.h> #include <scsi/scsi_eh.h> #include <scsi/scsi_common.h> #include <scsi/scsi_transport.h> #include <scsi/scsi_host.h> #include <scsi/scsi_ioctl.h> #include <scsi/scsi_dh.h> #include <scsi/scsi_devinfo.h> #include <scsi/sg.h> #include "scsi_priv.h" #include "scsi_logging.h" #include "scsi_transport_api.h" #include <trace/events/scsi.h> #include <linux/unaligned.h> /* * These should *probably* be handled by the host itself. * Since it is allowed to sleep, it probably should. */ #define BUS_RESET_SETTLE_TIME (10) #define HOST_RESET_SETTLE_TIME (10) static int scsi_eh_try_stu(struct scsi_cmnd *scmd); static enum scsi_disposition scsi_try_to_abort_cmd(const struct scsi_host_template *, struct scsi_cmnd *); void scsi_eh_wakeup(struct Scsi_Host *shost, unsigned int busy) { lockdep_assert_held(shost->host_lock); if (busy == shost->host_failed) { trace_scsi_eh_wakeup(shost); wake_up_process(shost->ehandler); SCSI_LOG_ERROR_RECOVERY(5, shost_printk(KERN_INFO, shost, "Waking error handler thread\n")); } } /** * scsi_schedule_eh - schedule EH for SCSI host * @shost: SCSI host to invoke error handling on. * * Schedule SCSI EH without scmd. 
*/ void scsi_schedule_eh(struct Scsi_Host *shost) { unsigned long flags; spin_lock_irqsave(shost->host_lock, flags); if (scsi_host_set_state(shost, SHOST_RECOVERY) == 0 || scsi_host_set_state(shost, SHOST_CANCEL_RECOVERY) == 0) { shost->host_eh_scheduled++; scsi_eh_wakeup(shost, scsi_host_busy(shost)); } spin_unlock_irqrestore(shost->host_lock, flags); } EXPORT_SYMBOL_GPL(scsi_schedule_eh); static int scsi_host_eh_past_deadline(struct Scsi_Host *shost) { if (!shost->last_reset || shost->eh_deadline == -1) return 0; /* * 32bit accesses are guaranteed to be atomic * (on all supported architectures), so instead * of using a spinlock we can as well double check * if eh_deadline has been set to 'off' during the * time_before call. */ if (time_before(jiffies, shost->last_reset + shost->eh_deadline) && shost->eh_deadline > -1) return 0; return 1; } static bool scsi_cmd_retry_allowed(struct scsi_cmnd *cmd) { if (cmd->allowed == SCSI_CMD_RETRIES_NO_LIMIT) return true; return ++cmd->retries <= cmd->allowed; } static bool scsi_eh_should_retry_cmd(struct scsi_cmnd *cmd) { struct scsi_device *sdev = cmd->device; struct Scsi_Host *host = sdev->host; if (host->hostt->eh_should_retry_cmd) return host->hostt->eh_should_retry_cmd(cmd); return true; } /** * scmd_eh_abort_handler - Handle command aborts * @work: command to be aborted. * * Note: this function must be called only for a command that has timed out. * Because the block layer marks a request as complete before it calls * scsi_timeout(), a .scsi_done() call from the LLD for a command that has * timed out does not have any effect. Hence it is safe to call * scsi_finish_command() from this function. */ void scmd_eh_abort_handler(struct work_struct *work) { struct scsi_cmnd *scmd = container_of(work, struct scsi_cmnd, abort_work.work); struct scsi_device *sdev = scmd->device; struct Scsi_Host *shost = sdev->host; enum scsi_disposition rtn; unsigned long flags; if (scsi_host_eh_past_deadline(shost)) { SCSI_LOG_ERROR_RECOVERY(3, scmd_printk(KERN_INFO, scmd, "eh timeout, not aborting\n")); goto out; } SCSI_LOG_ERROR_RECOVERY(3, scmd_printk(KERN_INFO, scmd, "aborting command\n")); rtn = scsi_try_to_abort_cmd(shost->hostt, scmd); if (rtn != SUCCESS) { SCSI_LOG_ERROR_RECOVERY(3, scmd_printk(KERN_INFO, scmd, "cmd abort %s\n", (rtn == FAST_IO_FAIL) ? "not sent" : "failed")); goto out; } set_host_byte(scmd, DID_TIME_OUT); if (scsi_host_eh_past_deadline(shost)) { SCSI_LOG_ERROR_RECOVERY(3, scmd_printk(KERN_INFO, scmd, "eh timeout, not retrying " "aborted command\n")); goto out; } spin_lock_irqsave(shost->host_lock, flags); list_del_init(&scmd->eh_entry); /* * If the abort succeeds, and there is no further * EH action, clear the ->last_reset time. */ if (list_empty(&shost->eh_abort_list) && list_empty(&shost->eh_cmd_q)) if (shost->eh_deadline != -1) shost->last_reset = 0; spin_unlock_irqrestore(shost->host_lock, flags); if (!scsi_noretry_cmd(scmd) && scsi_cmd_retry_allowed(scmd) && scsi_eh_should_retry_cmd(scmd)) { SCSI_LOG_ERROR_RECOVERY(3, scmd_printk(KERN_WARNING, scmd, "retry aborted command\n")); scsi_queue_insert(scmd, SCSI_MLQUEUE_EH_RETRY); } else { SCSI_LOG_ERROR_RECOVERY(3, scmd_printk(KERN_WARNING, scmd, "finish aborted command\n")); scsi_finish_command(scmd); } return; out: spin_lock_irqsave(shost->host_lock, flags); list_del_init(&scmd->eh_entry); spin_unlock_irqrestore(shost->host_lock, flags); scsi_eh_scmd_add(scmd); } /** * scsi_abort_command - schedule a command abort * @scmd: scmd to abort. 
* * We only need to abort commands after a command timeout */ static int scsi_abort_command(struct scsi_cmnd *scmd) { struct scsi_device *sdev = scmd->device; struct Scsi_Host *shost = sdev->host; unsigned long flags; if (!shost->hostt->eh_abort_handler) { /* No abort handler, fail command directly */ return FAILED; } if (scmd->eh_eflags & SCSI_EH_ABORT_SCHEDULED) { /* * Retry after abort failed, escalate to next level. */ SCSI_LOG_ERROR_RECOVERY(3, scmd_printk(KERN_INFO, scmd, "previous abort failed\n")); BUG_ON(delayed_work_pending(&scmd->abort_work)); return FAILED; } spin_lock_irqsave(shost->host_lock, flags); if (shost->eh_deadline != -1 && !shost->last_reset) shost->last_reset = jiffies; BUG_ON(!list_empty(&scmd->eh_entry)); list_add_tail(&scmd->eh_entry, &shost->eh_abort_list); spin_unlock_irqrestore(shost->host_lock, flags); scmd->eh_eflags |= SCSI_EH_ABORT_SCHEDULED; SCSI_LOG_ERROR_RECOVERY(3, scmd_printk(KERN_INFO, scmd, "abort scheduled\n")); queue_delayed_work(shost->tmf_work_q, &scmd->abort_work, HZ / 100); return SUCCESS; } /** * scsi_eh_reset - call into ->eh_action to reset internal counters * @scmd: scmd to run eh on. * * The scsi driver might be carrying internal state about the * devices, so we need to call into the driver to reset the * internal state once the error handler is started. */ static void scsi_eh_reset(struct scsi_cmnd *scmd) { if (!blk_rq_is_passthrough(scsi_cmd_to_rq(scmd))) { struct scsi_driver *sdrv = scsi_cmd_to_driver(scmd); if (sdrv->eh_reset) sdrv->eh_reset(scmd); } } static void scsi_eh_inc_host_failed(struct rcu_head *head) { struct scsi_cmnd *scmd = container_of(head, typeof(*scmd), rcu); struct Scsi_Host *shost = scmd->device->host; unsigned int busy = scsi_host_busy(shost); unsigned long flags; spin_lock_irqsave(shost->host_lock, flags); shost->host_failed++; scsi_eh_wakeup(shost, busy); spin_unlock_irqrestore(shost->host_lock, flags); } /** * scsi_eh_scmd_add - add scsi cmd to error handling. * @scmd: scmd to run eh on. */ void scsi_eh_scmd_add(struct scsi_cmnd *scmd) { struct Scsi_Host *shost = scmd->device->host; unsigned long flags; int ret; WARN_ON_ONCE(!shost->ehandler); WARN_ON_ONCE(!test_bit(SCMD_STATE_INFLIGHT, &scmd->state)); spin_lock_irqsave(shost->host_lock, flags); if (scsi_host_set_state(shost, SHOST_RECOVERY)) { ret = scsi_host_set_state(shost, SHOST_CANCEL_RECOVERY); WARN_ON_ONCE(ret); } if (shost->eh_deadline != -1 && !shost->last_reset) shost->last_reset = jiffies; scsi_eh_reset(scmd); list_add_tail(&scmd->eh_entry, &shost->eh_cmd_q); spin_unlock_irqrestore(shost->host_lock, flags); /* * Ensure that all tasks observe the host state change before the * host_failed change. */ call_rcu_hurry(&scmd->rcu, scsi_eh_inc_host_failed); } /** * scsi_timeout - Timeout function for normal scsi commands. * @req: request that is timing out. * * Notes: * We do not need to lock this. There is the potential for a race * only in that the normal completion handling might run, but if the * normal completion function determines that the timer has already * fired, then it mustn't do anything. 
*/ enum blk_eh_timer_return scsi_timeout(struct request *req) { struct scsi_cmnd *scmd = blk_mq_rq_to_pdu(req); struct Scsi_Host *host = scmd->device->host; trace_scsi_dispatch_cmd_timeout(scmd); scsi_log_completion(scmd, TIMEOUT_ERROR); atomic_inc(&scmd->device->iotmo_cnt); if (host->eh_deadline != -1 && !host->last_reset) host->last_reset = jiffies; if (host->hostt->eh_timed_out) { switch (host->hostt->eh_timed_out(scmd)) { case SCSI_EH_DONE: return BLK_EH_DONE; case SCSI_EH_RESET_TIMER: return BLK_EH_RESET_TIMER; case SCSI_EH_NOT_HANDLED: break; } } /* * If scsi_done() has already set SCMD_STATE_COMPLETE, do not modify * *scmd. */ if (test_and_set_bit(SCMD_STATE_COMPLETE, &scmd->state)) return BLK_EH_DONE; atomic_inc(&scmd->device->iodone_cnt); if (scsi_abort_command(scmd) != SUCCESS) { set_host_byte(scmd, DID_TIME_OUT); scsi_eh_scmd_add(scmd); } return BLK_EH_DONE; } /** * scsi_block_when_processing_errors - Prevent cmds from being queued. * @sdev: Device on which we are performing recovery. * * Description: * We block until the host is out of error recovery, and then check to * see whether the host or the device is offline. * * Return value: * 0 when dev was taken offline by error recovery. 1 OK to proceed. */ int scsi_block_when_processing_errors(struct scsi_device *sdev) { int online; wait_event(sdev->host->host_wait, !scsi_host_in_recovery(sdev->host)); online = scsi_device_online(sdev); return online; } EXPORT_SYMBOL(scsi_block_when_processing_errors); #ifdef CONFIG_SCSI_LOGGING /** * scsi_eh_prt_fail_stats - Log info on failures. * @shost: scsi host being recovered. * @work_q: Queue of scsi cmds to process. */ static inline void scsi_eh_prt_fail_stats(struct Scsi_Host *shost, struct list_head *work_q) { struct scsi_cmnd *scmd; struct scsi_device *sdev; int total_failures = 0; int cmd_failed = 0; int cmd_cancel = 0; int devices_failed = 0; shost_for_each_device(sdev, shost) { list_for_each_entry(scmd, work_q, eh_entry) { if (scmd->device == sdev) { ++total_failures; if (scmd->eh_eflags & SCSI_EH_ABORT_SCHEDULED) ++cmd_cancel; else ++cmd_failed; } } if (cmd_cancel || cmd_failed) { SCSI_LOG_ERROR_RECOVERY(3, shost_printk(KERN_INFO, shost, "%s: cmds failed: %d, cancel: %d\n", __func__, cmd_failed, cmd_cancel)); cmd_cancel = 0; cmd_failed = 0; ++devices_failed; } } SCSI_LOG_ERROR_RECOVERY(2, shost_printk(KERN_INFO, shost, "Total of %d commands on %d" " devices require eh work\n", total_failures, devices_failed)); } #endif /** * scsi_report_lun_change - Set flag on all *other* devices on the same target * to indicate that a UNIT ATTENTION is expected. * @sdev: Device reporting the UNIT ATTENTION */ static void scsi_report_lun_change(struct scsi_device *sdev) { sdev->sdev_target->expecting_lun_change = 1; } /** * scsi_report_sense - Examine scsi sense information and log messages for * certain conditions, also issue uevents for some of them. * @sdev: Device reporting the sense code * @sshdr: sshdr to be examined */ static void scsi_report_sense(struct scsi_device *sdev, struct scsi_sense_hdr *sshdr) { enum scsi_device_event evt_type = SDEV_EVT_MAXBITS; /* i.e. none */ if (sshdr->sense_key == UNIT_ATTENTION) { if (sshdr->asc == 0x3f && sshdr->ascq == 0x03) { evt_type = SDEV_EVT_INQUIRY_CHANGE_REPORTED; sdev_printk(KERN_WARNING, sdev, "Inquiry data has changed"); } else if (sshdr->asc == 0x3f && sshdr->ascq == 0x0e) { evt_type = SDEV_EVT_LUN_CHANGE_REPORTED; scsi_report_lun_change(sdev); sdev_printk(KERN_WARNING, sdev, "LUN assignments on this target have " "changed. 
The Linux SCSI layer does not " "automatically remap LUN assignments.\n"); } else if (sshdr->asc == 0x3f) sdev_printk(KERN_WARNING, sdev, "Operating parameters on this target have " "changed. The Linux SCSI layer does not " "automatically adjust these parameters.\n"); if (sshdr->asc == 0x38 && sshdr->ascq == 0x07) { evt_type = SDEV_EVT_SOFT_THRESHOLD_REACHED_REPORTED; sdev_printk(KERN_WARNING, sdev, "Warning! Received an indication that the " "LUN reached a thin provisioning soft " "threshold.\n"); } if (sshdr->asc == 0x29) { evt_type = SDEV_EVT_POWER_ON_RESET_OCCURRED; /* * Do not print message if it is an expected side-effect * of runtime PM. */ if (!sdev->silence_suspend) sdev_printk(KERN_WARNING, sdev, "Power-on or device reset occurred\n"); } if (sshdr->asc == 0x2a && sshdr->ascq == 0x01) { evt_type = SDEV_EVT_MODE_PARAMETER_CHANGE_REPORTED; sdev_printk(KERN_WARNING, sdev, "Mode parameters changed"); } else if (sshdr->asc == 0x2a && sshdr->ascq == 0x06) { evt_type = SDEV_EVT_ALUA_STATE_CHANGE_REPORTED; sdev_printk(KERN_WARNING, sdev, "Asymmetric access state changed"); } else if (sshdr->asc == 0x2a && sshdr->ascq == 0x09) { evt_type = SDEV_EVT_CAPACITY_CHANGE_REPORTED; sdev_printk(KERN_WARNING, sdev, "Capacity data has changed"); } else if (sshdr->asc == 0x2a) sdev_printk(KERN_WARNING, sdev, "Parameters changed"); } if (evt_type != SDEV_EVT_MAXBITS) { set_bit(evt_type, sdev->pending_events); schedule_work(&sdev->event_work); } } static inline void set_scsi_ml_byte(struct scsi_cmnd *cmd, u8 status) { cmd->result = (cmd->result & 0xffff00ff) | (status << 8); } /** * scsi_check_sense - Examine scsi cmd sense * @scmd: Cmd to have sense checked. * * Return value: * SUCCESS or FAILED or NEEDS_RETRY or ADD_TO_MLQUEUE * * Notes: * When a deferred error is detected the current command has * not been executed and needs retrying. */ enum scsi_disposition scsi_check_sense(struct scsi_cmnd *scmd) { struct request *req = scsi_cmd_to_rq(scmd); struct scsi_device *sdev = scmd->device; struct scsi_sense_hdr sshdr; if (! scsi_command_normalize_sense(scmd, &sshdr)) return FAILED; /* no valid sense data */ scsi_report_sense(sdev, &sshdr); if (sshdr.sense_key == UNIT_ATTENTION) { /* * Increment the counters for Power on/Reset or New Media so * that all ULDs interested in these can see that those have * happened, even if someone else gets the sense data. */ if (sshdr.asc == 0x28) scmd->device->ua_new_media_ctr++; else if (sshdr.asc == 0x29) scmd->device->ua_por_ctr++; } if (scsi_sense_is_deferred(&sshdr)) return NEEDS_RETRY; if (sdev->handler && sdev->handler->check_sense) { enum scsi_disposition rc; rc = sdev->handler->check_sense(sdev, &sshdr); if (rc != SCSI_RETURN_NOT_HANDLED) return rc; /* handler does not care. Drop down to default handling */ } if (scmd->cmnd[0] == TEST_UNIT_READY && scmd->submitter != SUBMITTED_BY_SCSI_ERROR_HANDLER) /* * nasty: for mid-layer issued TURs, we need to return the * actual sense data without any recovery attempt. For eh * issued ones, we need to try to recover and interpret */ return SUCCESS; /* * Previous logic looked for FILEMARK, EOM or ILI which are * mainly associated with tapes and returned SUCCESS. */ if (sshdr.response_code == 0x70) { /* fixed format */ if (scmd->sense_buffer[2] & 0xe0) return SUCCESS; } else { /* * descriptor format: look for "stream commands sense data * descriptor" (see SSC-3). Assume single sense data * descriptor. Ignore ILI from SBC-2 READ LONG and WRITE LONG. 
*/ if ((sshdr.additional_length > 3) && (scmd->sense_buffer[8] == 0x4) && (scmd->sense_buffer[11] & 0xe0)) return SUCCESS; } switch (sshdr.sense_key) { case NO_SENSE: return SUCCESS; case RECOVERED_ERROR: return /* soft_error */ SUCCESS; case ABORTED_COMMAND: if (sshdr.asc == 0x10) /* DIF */ return SUCCESS; /* * Check aborts due to command duration limit policy: * ABORTED COMMAND additional sense code with the * COMMAND TIMEOUT BEFORE PROCESSING or * COMMAND TIMEOUT DURING PROCESSING or * COMMAND TIMEOUT DURING PROCESSING DUE TO ERROR RECOVERY * additional sense code qualifiers. */ if (sshdr.asc == 0x2e && sshdr.ascq >= 0x01 && sshdr.ascq <= 0x03) { set_scsi_ml_byte(scmd, SCSIML_STAT_DL_TIMEOUT); req->cmd_flags |= REQ_FAILFAST_DEV; req->rq_flags |= RQF_QUIET; return SUCCESS; } if (sshdr.asc == 0x44 && sdev->sdev_bflags & BLIST_RETRY_ITF) return ADD_TO_MLQUEUE; if (sshdr.asc == 0xc1 && sshdr.ascq == 0x01 && sdev->sdev_bflags & BLIST_RETRY_ASC_C1) return ADD_TO_MLQUEUE; return NEEDS_RETRY; case NOT_READY: case UNIT_ATTENTION: /* * if we are expecting a cc/ua because of a bus reset that we * performed, treat this just as a retry. otherwise this is * information that we should pass up to the upper-level driver * so that we can deal with it there. */ if (scmd->device->expecting_cc_ua) { /* * Because some device does not queue unit * attentions correctly, we carefully check * additional sense code and qualifier so as * not to squash media change unit attention. */ if (sshdr.asc != 0x28 || sshdr.ascq != 0x00) { scmd->device->expecting_cc_ua = 0; return NEEDS_RETRY; } } /* * we might also expect a cc/ua if another LUN on the target * reported a UA with an ASC/ASCQ of 3F 0E - * REPORTED LUNS DATA HAS CHANGED. */ if (scmd->device->sdev_target->expecting_lun_change && sshdr.asc == 0x3f && sshdr.ascq == 0x0e) return NEEDS_RETRY; /* * if the device is in the process of becoming ready, we * should retry. */ if ((sshdr.asc == 0x04) && (sshdr.ascq == 0x01 || sshdr.ascq == 0x0a)) return NEEDS_RETRY; /* * if the device is not started, we need to wake * the error handler to start the motor */ if (scmd->device->allow_restart && (sshdr.asc == 0x04) && (sshdr.ascq == 0x02)) return FAILED; /* * Pass the UA upwards for a determination in the completion * functions. 
*/ return SUCCESS; /* these are not supported */ case DATA_PROTECT: if (sshdr.asc == 0x27 && sshdr.ascq == 0x07) { /* Thin provisioning hard threshold reached */ set_scsi_ml_byte(scmd, SCSIML_STAT_NOSPC); return SUCCESS; } fallthrough; case COPY_ABORTED: case VOLUME_OVERFLOW: case MISCOMPARE: case BLANK_CHECK: set_scsi_ml_byte(scmd, SCSIML_STAT_TGT_FAILURE); return SUCCESS; case MEDIUM_ERROR: if (sshdr.asc == 0x11 || /* UNRECOVERED READ ERR */ sshdr.asc == 0x13 || /* AMNF DATA FIELD */ sshdr.asc == 0x14) { /* RECORD NOT FOUND */ set_scsi_ml_byte(scmd, SCSIML_STAT_MED_ERROR); return SUCCESS; } return NEEDS_RETRY; case HARDWARE_ERROR: if (scmd->device->retry_hwerror) return ADD_TO_MLQUEUE; else set_scsi_ml_byte(scmd, SCSIML_STAT_TGT_FAILURE); fallthrough; case ILLEGAL_REQUEST: if (sshdr.asc == 0x20 || /* Invalid command operation code */ sshdr.asc == 0x21 || /* Logical block address out of range */ sshdr.asc == 0x22 || /* Invalid function */ sshdr.asc == 0x24 || /* Invalid field in cdb */ sshdr.asc == 0x26 || /* Parameter value invalid */ sshdr.asc == 0x27) { /* Write protected */ set_scsi_ml_byte(scmd, SCSIML_STAT_TGT_FAILURE); } return SUCCESS; case COMPLETED: /* * A command using command duration limits (CDL) with a * descriptor set with policy 0xD may be completed with success * and the sense data DATA CURRENTLY UNAVAILABLE, indicating * that the command was in fact aborted because it exceeded its * duration limit. Never retry these commands. */ if (sshdr.asc == 0x55 && sshdr.ascq == 0x0a) { set_scsi_ml_byte(scmd, SCSIML_STAT_DL_TIMEOUT); req->cmd_flags |= REQ_FAILFAST_DEV; req->rq_flags |= RQF_QUIET; } return SUCCESS; default: return SUCCESS; } } EXPORT_SYMBOL_GPL(scsi_check_sense); static void scsi_handle_queue_ramp_up(struct scsi_device *sdev) { const struct scsi_host_template *sht = sdev->host->hostt; struct scsi_device *tmp_sdev; if (!sht->track_queue_depth || sdev->queue_depth >= sdev->max_queue_depth) return; if (time_before(jiffies, sdev->last_queue_ramp_up + sdev->queue_ramp_up_period)) return; if (time_before(jiffies, sdev->last_queue_full_time + sdev->queue_ramp_up_period)) return; /* * Walk all devices of a target and do * ramp up on them. */ shost_for_each_device(tmp_sdev, sdev->host) { if (tmp_sdev->channel != sdev->channel || tmp_sdev->id != sdev->id || tmp_sdev->queue_depth == sdev->max_queue_depth) continue; scsi_change_queue_depth(tmp_sdev, tmp_sdev->queue_depth + 1); sdev->last_queue_ramp_up = jiffies; } } static void scsi_handle_queue_full(struct scsi_device *sdev) { const struct scsi_host_template *sht = sdev->host->hostt; struct scsi_device *tmp_sdev; if (!sht->track_queue_depth) return; shost_for_each_device(tmp_sdev, sdev->host) { if (tmp_sdev->channel != sdev->channel || tmp_sdev->id != sdev->id) continue; /* * We do not know the number of commands that were at * the device when we got the queue full so we start * from the highest possible value and work our way down. */ scsi_track_queue_full(tmp_sdev, tmp_sdev->queue_depth - 1); } } /** * scsi_eh_completed_normally - Disposition a eh cmd on return from LLD. * @scmd: SCSI cmd to examine. * * Notes: * This is *only* called when we are examining the status of commands * queued during error recovery. the main difference here is that we * don't allow for the possibility of retries here, and we are a lot * more restrictive about what we consider acceptable. 
*/ static enum scsi_disposition scsi_eh_completed_normally(struct scsi_cmnd *scmd) { /* * first check the host byte, to see if there is anything in there * that would indicate what we need to do. */ if (host_byte(scmd->result) == DID_RESET) { /* * rats. we are already in the error handler, so we now * get to try and figure out what to do next. if the sense * is valid, we have a pretty good idea of what to do. * if not, we mark it as FAILED. */ return scsi_check_sense(scmd); } if (host_byte(scmd->result) != DID_OK) return FAILED; /* * now, check the status byte to see if this indicates * anything special. */ switch (get_status_byte(scmd)) { case SAM_STAT_GOOD: scsi_handle_queue_ramp_up(scmd->device); if (scmd->sense_buffer && SCSI_SENSE_VALID(scmd)) /* * If we have sense data, call scsi_check_sense() in * order to set the correct SCSI ML byte (if any). * No point in checking the return value, since the * command has already completed successfully. */ scsi_check_sense(scmd); fallthrough; case SAM_STAT_COMMAND_TERMINATED: return SUCCESS; case SAM_STAT_CHECK_CONDITION: return scsi_check_sense(scmd); case SAM_STAT_CONDITION_MET: case SAM_STAT_INTERMEDIATE: case SAM_STAT_INTERMEDIATE_CONDITION_MET: /* * who knows? FIXME(eric) */ return SUCCESS; case SAM_STAT_RESERVATION_CONFLICT: if (scmd->cmnd[0] == TEST_UNIT_READY) /* it is a success, we probed the device and * found it */ return SUCCESS; /* otherwise, we failed to send the command */ return FAILED; case SAM_STAT_TASK_SET_FULL: scsi_handle_queue_full(scmd->device); fallthrough; case SAM_STAT_BUSY: return NEEDS_RETRY; default: return FAILED; } return FAILED; } /** * scsi_eh_done - Completion function for error handling. * @scmd: Cmd that is done. */ void scsi_eh_done(struct scsi_cmnd *scmd) { struct completion *eh_action; SCSI_LOG_ERROR_RECOVERY(3, scmd_printk(KERN_INFO, scmd, "%s result: %x\n", __func__, scmd->result)); eh_action = scmd->device->host->eh_action; if (eh_action) complete(eh_action); } /** * scsi_try_host_reset - ask host adapter to reset itself * @scmd: SCSI cmd to send host reset. */ static enum scsi_disposition scsi_try_host_reset(struct scsi_cmnd *scmd) { unsigned long flags; enum scsi_disposition rtn; struct Scsi_Host *host = scmd->device->host; const struct scsi_host_template *hostt = host->hostt; SCSI_LOG_ERROR_RECOVERY(3, shost_printk(KERN_INFO, host, "Snd Host RST\n")); if (!hostt->eh_host_reset_handler) return FAILED; rtn = hostt->eh_host_reset_handler(scmd); if (rtn == SUCCESS) { if (!hostt->skip_settle_delay) ssleep(HOST_RESET_SETTLE_TIME); spin_lock_irqsave(host->host_lock, flags); scsi_report_bus_reset(host, scmd_channel(scmd)); spin_unlock_irqrestore(host->host_lock, flags); } return rtn; } /** * scsi_try_bus_reset - ask host to perform a bus reset * @scmd: SCSI cmd to send bus reset. 
*/ static enum scsi_disposition scsi_try_bus_reset(struct scsi_cmnd *scmd) { unsigned long flags; enum scsi_disposition rtn; struct Scsi_Host *host = scmd->device->host; const struct scsi_host_template *hostt = host->hostt; SCSI_LOG_ERROR_RECOVERY(3, scmd_printk(KERN_INFO, scmd, "%s: Snd Bus RST\n", __func__)); if (!hostt->eh_bus_reset_handler) return FAILED; rtn = hostt->eh_bus_reset_handler(scmd); if (rtn == SUCCESS) { if (!hostt->skip_settle_delay) ssleep(BUS_RESET_SETTLE_TIME); spin_lock_irqsave(host->host_lock, flags); scsi_report_bus_reset(host, scmd_channel(scmd)); spin_unlock_irqrestore(host->host_lock, flags); } return rtn; } static void __scsi_report_device_reset(struct scsi_device *sdev, void *data) { sdev->was_reset = 1; sdev->expecting_cc_ua = 1; } /** * scsi_try_target_reset - Ask host to perform a target reset * @scmd: SCSI cmd used to send a target reset * * Notes: * There is no timeout for this operation. if this operation is * unreliable for a given host, then the host itself needs to put a * timer on it, and set the host back to a consistent state prior to * returning. */ static enum scsi_disposition scsi_try_target_reset(struct scsi_cmnd *scmd) { unsigned long flags; enum scsi_disposition rtn; struct Scsi_Host *host = scmd->device->host; const struct scsi_host_template *hostt = host->hostt; if (!hostt->eh_target_reset_handler) return FAILED; rtn = hostt->eh_target_reset_handler(scmd); if (rtn == SUCCESS) { spin_lock_irqsave(host->host_lock, flags); __starget_for_each_device(scsi_target(scmd->device), NULL, __scsi_report_device_reset); spin_unlock_irqrestore(host->host_lock, flags); } return rtn; } /** * scsi_try_bus_device_reset - Ask host to perform a BDR on a dev * @scmd: SCSI cmd used to send BDR * * Notes: * There is no timeout for this operation. if this operation is * unreliable for a given host, then the host itself needs to put a * timer on it, and set the host back to a consistent state prior to * returning. */ static enum scsi_disposition scsi_try_bus_device_reset(struct scsi_cmnd *scmd) { enum scsi_disposition rtn; const struct scsi_host_template *hostt = scmd->device->host->hostt; if (!hostt->eh_device_reset_handler) return FAILED; rtn = hostt->eh_device_reset_handler(scmd); if (rtn == SUCCESS) __scsi_report_device_reset(scmd->device, NULL); return rtn; } /** * scsi_try_to_abort_cmd - Ask host to abort a SCSI command * @hostt: SCSI driver host template * @scmd: SCSI cmd used to send a target reset * * Return value: * SUCCESS, FAILED, or FAST_IO_FAIL * * Notes: * SUCCESS does not necessarily indicate that the command * has been aborted; it only indicates that the LLDDs * has cleared all references to that command. * LLDDs should return FAILED only if an abort was required * but could not be executed. 
LLDDs should return FAST_IO_FAIL * if the device is temporarily unavailable (eg due to a * link down on FibreChannel) */ static enum scsi_disposition scsi_try_to_abort_cmd(const struct scsi_host_template *hostt, struct scsi_cmnd *scmd) { if (!hostt->eh_abort_handler) return FAILED; return hostt->eh_abort_handler(scmd); } static void scsi_abort_eh_cmnd(struct scsi_cmnd *scmd) { if (scsi_try_to_abort_cmd(scmd->device->host->hostt, scmd) != SUCCESS) if (scsi_try_bus_device_reset(scmd) != SUCCESS) if (scsi_try_target_reset(scmd) != SUCCESS) if (scsi_try_bus_reset(scmd) != SUCCESS) scsi_try_host_reset(scmd); } /** * scsi_eh_prep_cmnd - Save a scsi command info as part of error recovery * @scmd: SCSI command structure to hijack * @ses: structure to save restore information * @cmnd: CDB to send. Can be NULL if no new cmnd is needed * @cmnd_size: size in bytes of @cmnd (must be <= MAX_COMMAND_SIZE) * @sense_bytes: size of sense data to copy. or 0 (if != 0 @cmnd is ignored) * * This function is used to save a scsi command information before re-execution * as part of the error recovery process. If @sense_bytes is 0 the command * sent must be one that does not transfer any data. If @sense_bytes != 0 * @cmnd is ignored and this functions sets up a REQUEST_SENSE command * and cmnd buffers to read @sense_bytes into @scmd->sense_buffer. */ void scsi_eh_prep_cmnd(struct scsi_cmnd *scmd, struct scsi_eh_save *ses, unsigned char *cmnd, int cmnd_size, unsigned sense_bytes) { struct scsi_device *sdev = scmd->device; /* * We need saved copies of a number of fields - this is because * error handling may need to overwrite these with different values * to run different commands, and once error handling is complete, * we will need to restore these values prior to running the actual * command. */ ses->cmd_len = scmd->cmd_len; ses->data_direction = scmd->sc_data_direction; ses->sdb = scmd->sdb; ses->result = scmd->result; ses->resid_len = scmd->resid_len; ses->underflow = scmd->underflow; ses->prot_op = scmd->prot_op; ses->eh_eflags = scmd->eh_eflags; scmd->prot_op = SCSI_PROT_NORMAL; scmd->eh_eflags = 0; memcpy(ses->cmnd, scmd->cmnd, sizeof(ses->cmnd)); memset(scmd->cmnd, 0, sizeof(scmd->cmnd)); memset(&scmd->sdb, 0, sizeof(scmd->sdb)); scmd->result = 0; scmd->resid_len = 0; if (sense_bytes) { scmd->sdb.length = min_t(unsigned, SCSI_SENSE_BUFFERSIZE, sense_bytes); sg_init_one(&ses->sense_sgl, scmd->sense_buffer, scmd->sdb.length); scmd->sdb.table.sgl = &ses->sense_sgl; scmd->sc_data_direction = DMA_FROM_DEVICE; scmd->sdb.table.nents = scmd->sdb.table.orig_nents = 1; scmd->cmnd[0] = REQUEST_SENSE; scmd->cmnd[4] = scmd->sdb.length; scmd->cmd_len = COMMAND_SIZE(scmd->cmnd[0]); } else { scmd->sc_data_direction = DMA_NONE; if (cmnd) { BUG_ON(cmnd_size > sizeof(scmd->cmnd)); memcpy(scmd->cmnd, cmnd, cmnd_size); scmd->cmd_len = COMMAND_SIZE(scmd->cmnd[0]); } } scmd->underflow = 0; if (sdev->scsi_level <= SCSI_2 && sdev->scsi_level != SCSI_UNKNOWN) scmd->cmnd[1] = (scmd->cmnd[1] & 0x1f) | (sdev->lun << 5 & 0xe0); /* * Zero the sense buffer. The scsi spec mandates that any * untransferred sense data should be interpreted as being zero. */ memset(scmd->sense_buffer, 0, SCSI_SENSE_BUFFERSIZE); } EXPORT_SYMBOL(scsi_eh_prep_cmnd); /** * scsi_eh_restore_cmnd - Restore a scsi command info as part of error recovery * @scmd: SCSI command structure to restore * @ses: saved information from a coresponding call to scsi_eh_prep_cmnd * * Undo any damage done by above scsi_eh_prep_cmnd(). 
*/ void scsi_eh_restore_cmnd(struct scsi_cmnd* scmd, struct scsi_eh_save *ses) { /* * Restore original data */ scmd->cmd_len = ses->cmd_len; memcpy(scmd->cmnd, ses->cmnd, sizeof(ses->cmnd)); scmd->sc_data_direction = ses->data_direction; scmd->sdb = ses->sdb; scmd->result = ses->result; scmd->resid_len = ses->resid_len; scmd->underflow = ses->underflow; scmd->prot_op = ses->prot_op; scmd->eh_eflags = ses->eh_eflags; } EXPORT_SYMBOL(scsi_eh_restore_cmnd); /** * scsi_send_eh_cmnd - submit a scsi command as part of error recovery * @scmd: SCSI command structure to hijack * @cmnd: CDB to send * @cmnd_size: size in bytes of @cmnd * @timeout: timeout for this request * @sense_bytes: size of sense data to copy or 0 * * This function is used to send a scsi command down to a target device * as part of the error recovery process. See also scsi_eh_prep_cmnd() above. * * Return value: * SUCCESS or FAILED or NEEDS_RETRY */ static enum scsi_disposition scsi_send_eh_cmnd(struct scsi_cmnd *scmd, unsigned char *cmnd, int cmnd_size, int timeout, unsigned sense_bytes) { struct scsi_device *sdev = scmd->device; struct Scsi_Host *shost = sdev->host; DECLARE_COMPLETION_ONSTACK(done); unsigned long timeleft = timeout, delay; struct scsi_eh_save ses; const unsigned long stall_for = msecs_to_jiffies(100); int rtn; retry: scsi_eh_prep_cmnd(scmd, &ses, cmnd, cmnd_size, sense_bytes); shost->eh_action = &done; scsi_log_send(scmd); scmd->submitter = SUBMITTED_BY_SCSI_ERROR_HANDLER; scmd->flags |= SCMD_LAST; /* * Lock sdev->state_mutex to avoid that scsi_device_quiesce() can * change the SCSI device state after we have examined it and before * .queuecommand() is called. */ mutex_lock(&sdev->state_mutex); while (sdev->sdev_state == SDEV_BLOCK && timeleft > 0) { mutex_unlock(&sdev->state_mutex); SCSI_LOG_ERROR_RECOVERY(5, sdev_printk(KERN_DEBUG, sdev, "%s: state %d <> %d\n", __func__, sdev->sdev_state, SDEV_BLOCK)); delay = min(timeleft, stall_for); timeleft -= delay; msleep(jiffies_to_msecs(delay)); mutex_lock(&sdev->state_mutex); } if (sdev->sdev_state != SDEV_BLOCK) rtn = shost->hostt->queuecommand(shost, scmd); else rtn = FAILED; mutex_unlock(&sdev->state_mutex); if (rtn) { if (timeleft > stall_for) { scsi_eh_restore_cmnd(scmd, &ses); timeleft -= stall_for; msleep(jiffies_to_msecs(stall_for)); goto retry; } /* signal not to enter either branch of the if () below */ timeleft = 0; rtn = FAILED; } else { timeleft = wait_for_completion_timeout(&done, timeout); rtn = SUCCESS; } shost->eh_action = NULL; scsi_log_completion(scmd, rtn); SCSI_LOG_ERROR_RECOVERY(3, scmd_printk(KERN_INFO, scmd, "%s timeleft: %ld\n", __func__, timeleft)); /* * If there is time left scsi_eh_done got called, and we will examine * the actual status codes to see whether the command actually did * complete normally, else if we have a zero return and no time left, * the command must still be pending, so abort it and return FAILED. 
* If we never actually managed to issue the command, because * ->queuecommand() kept returning non zero, use the rtn = FAILED * value above (so don't execute either branch of the if) */ if (timeleft) { rtn = scsi_eh_completed_normally(scmd); SCSI_LOG_ERROR_RECOVERY(3, scmd_printk(KERN_INFO, scmd, "%s: scsi_eh_completed_normally %x\n", __func__, rtn)); switch (rtn) { case SUCCESS: case NEEDS_RETRY: case FAILED: break; case ADD_TO_MLQUEUE: rtn = NEEDS_RETRY; break; default: rtn = FAILED; break; } } else if (rtn != FAILED) { scsi_abort_eh_cmnd(scmd); rtn = FAILED; } scsi_eh_restore_cmnd(scmd, &ses); return rtn; } /** * scsi_request_sense - Request sense data from a particular target. * @scmd: SCSI cmd for request sense. * * Notes: * Some hosts automatically obtain this information, others require * that we obtain it on our own. This function will *not* return until * the command either times out, or it completes. */ static enum scsi_disposition scsi_request_sense(struct scsi_cmnd *scmd) { return scsi_send_eh_cmnd(scmd, NULL, 0, scmd->device->eh_timeout, ~0); } static enum scsi_disposition scsi_eh_action(struct scsi_cmnd *scmd, enum scsi_disposition rtn) { if (!blk_rq_is_passthrough(scsi_cmd_to_rq(scmd))) { struct scsi_driver *sdrv = scsi_cmd_to_driver(scmd); if (sdrv->eh_action) rtn = sdrv->eh_action(scmd, rtn); } return rtn; } /** * scsi_eh_finish_cmd - Handle a cmd that eh is finished with. * @scmd: Original SCSI cmd that eh has finished. * @done_q: Queue for processed commands. * * Notes: * We don't want to use the normal command completion while we are are * still handling errors - it may cause other commands to be queued, * and that would disturb what we are doing. Thus we really want to * keep a list of pending commands for final completion, and once we * are ready to leave error handling we handle completion for real. */ void scsi_eh_finish_cmd(struct scsi_cmnd *scmd, struct list_head *done_q) { list_move_tail(&scmd->eh_entry, done_q); } EXPORT_SYMBOL(scsi_eh_finish_cmd); /** * scsi_eh_get_sense - Get device sense data. * @work_q: Queue of commands to process. * @done_q: Queue of processed commands. * * Description: * See if we need to request sense information. if so, then get it * now, so we have a better idea of what to do. * * Notes: * This has the unfortunate side effect that if a shost adapter does * not automatically request sense information, we end up shutting * it down before we request it. * * All drivers should request sense information internally these days, * so for now all I have to say is tough noogies if you end up in here. * * XXX: Long term this code should go away, but that needs an audit of * all LLDDs first. */ int scsi_eh_get_sense(struct list_head *work_q, struct list_head *done_q) { struct scsi_cmnd *scmd, *next; struct Scsi_Host *shost; enum scsi_disposition rtn; /* * If SCSI_EH_ABORT_SCHEDULED has been set, it is timeout IO, * should not get sense. 
*/ list_for_each_entry_safe(scmd, next, work_q, eh_entry) { if ((scmd->eh_eflags & SCSI_EH_ABORT_SCHEDULED) || SCSI_SENSE_VALID(scmd)) continue; shost = scmd->device->host; if (scsi_host_eh_past_deadline(shost)) { SCSI_LOG_ERROR_RECOVERY(3, scmd_printk(KERN_INFO, scmd, "%s: skip request sense, past eh deadline\n", current->comm)); break; } if (!scsi_status_is_check_condition(scmd->result)) /* * don't request sense if there's no check condition * status because the error we're processing isn't one * that has a sense code (and some devices get * confused by sense requests out of the blue) */ continue; SCSI_LOG_ERROR_RECOVERY(2, scmd_printk(KERN_INFO, scmd, "%s: requesting sense\n", current->comm)); rtn = scsi_request_sense(scmd); if (rtn != SUCCESS) continue; SCSI_LOG_ERROR_RECOVERY(3, scmd_printk(KERN_INFO, scmd, "sense requested, result %x\n", scmd->result)); SCSI_LOG_ERROR_RECOVERY(3, scsi_print_sense(scmd)); rtn = scsi_decide_disposition(scmd); /* * if the result was normal, then just pass it along to the * upper level. */ if (rtn == SUCCESS) /* * We don't want this command reissued, just finished * with the sense data, so set retries to the max * allowed to ensure it won't get reissued. If the user * has requested infinite retries, we also want to * finish this command, so force completion by setting * retries and allowed to the same value. */ if (scmd->allowed == SCSI_CMD_RETRIES_NO_LIMIT) scmd->retries = scmd->allowed = 1; else scmd->retries = scmd->allowed; else if (rtn != NEEDS_RETRY) continue; scsi_eh_finish_cmd(scmd, done_q); } return list_empty(work_q); } EXPORT_SYMBOL_GPL(scsi_eh_get_sense); /** * scsi_eh_tur - Send TUR to device. * @scmd: &scsi_cmnd to send TUR * * Return value: * 0 - Device is ready. 1 - Device NOT ready. */ static int scsi_eh_tur(struct scsi_cmnd *scmd) { static unsigned char tur_command[6] = {TEST_UNIT_READY, 0, 0, 0, 0, 0}; int retry_cnt = 1; enum scsi_disposition rtn; retry_tur: rtn = scsi_send_eh_cmnd(scmd, tur_command, 6, scmd->device->eh_timeout, 0); SCSI_LOG_ERROR_RECOVERY(3, scmd_printk(KERN_INFO, scmd, "%s return: %x\n", __func__, rtn)); switch (rtn) { case NEEDS_RETRY: if (retry_cnt--) goto retry_tur; fallthrough; case SUCCESS: return 0; default: return 1; } } /** * scsi_eh_test_devices - check if devices are responding from error recovery. * @cmd_list: scsi commands in error recovery. * @work_q: queue for commands which still need more error recovery * @done_q: queue for commands which are finished * @try_stu: boolean on if a STU command should be tried in addition to TUR. * * Decription: * Tests if devices are in a working state. Commands to devices now in * a working state are sent to the done_q while commands to devices which * are still failing to respond are returned to the work_q for more * processing. 
**/ static int scsi_eh_test_devices(struct list_head *cmd_list, struct list_head *work_q, struct list_head *done_q, int try_stu) { struct scsi_cmnd *scmd, *next; struct scsi_device *sdev; int finish_cmds; while (!list_empty(cmd_list)) { scmd = list_entry(cmd_list->next, struct scsi_cmnd, eh_entry); sdev = scmd->device; if (!try_stu) { if (scsi_host_eh_past_deadline(sdev->host)) { /* Push items back onto work_q */ list_splice_init(cmd_list, work_q); SCSI_LOG_ERROR_RECOVERY(3, sdev_printk(KERN_INFO, sdev, "%s: skip test device, past eh deadline", current->comm)); break; } } finish_cmds = !scsi_device_online(scmd->device) || (try_stu && !scsi_eh_try_stu(scmd) && !scsi_eh_tur(scmd)) || !scsi_eh_tur(scmd); list_for_each_entry_safe(scmd, next, cmd_list, eh_entry) if (scmd->device == sdev) { if (finish_cmds && (try_stu || scsi_eh_action(scmd, SUCCESS) == SUCCESS)) scsi_eh_finish_cmd(scmd, done_q); else list_move_tail(&scmd->eh_entry, work_q); } } return list_empty(work_q); } /** * scsi_eh_try_stu - Send START_UNIT to device. * @scmd: &scsi_cmnd to send START_UNIT * * Return value: * 0 - Device is ready. 1 - Device NOT ready. */ static int scsi_eh_try_stu(struct scsi_cmnd *scmd) { static unsigned char stu_command[6] = {START_STOP, 0, 0, 0, 1, 0}; if (scmd->device->allow_restart) { int i; enum scsi_disposition rtn = NEEDS_RETRY; for (i = 0; rtn == NEEDS_RETRY && i < 2; i++) rtn = scsi_send_eh_cmnd(scmd, stu_command, 6, scmd->device->eh_timeout, 0); if (rtn == SUCCESS) return 0; } return 1; } /** * scsi_eh_stu - send START_UNIT if needed * @shost: &scsi host being recovered. * @work_q: &list_head for pending commands. * @done_q: &list_head for processed commands. * * Notes: * If commands are failing due to not ready, initializing command required, * try revalidating the device, which will end up sending a start unit. */ static int scsi_eh_stu(struct Scsi_Host *shost, struct list_head *work_q, struct list_head *done_q) { struct scsi_cmnd *scmd, *stu_scmd, *next; struct scsi_device *sdev; shost_for_each_device(sdev, shost) { if (scsi_host_eh_past_deadline(shost)) { SCSI_LOG_ERROR_RECOVERY(3, sdev_printk(KERN_INFO, sdev, "%s: skip START_UNIT, past eh deadline\n", current->comm)); scsi_device_put(sdev); break; } stu_scmd = NULL; list_for_each_entry(scmd, work_q, eh_entry) if (scmd->device == sdev && SCSI_SENSE_VALID(scmd) && scsi_check_sense(scmd) == FAILED ) { stu_scmd = scmd; break; } if (!stu_scmd) continue; SCSI_LOG_ERROR_RECOVERY(3, sdev_printk(KERN_INFO, sdev, "%s: Sending START_UNIT\n", current->comm)); if (!scsi_eh_try_stu(stu_scmd)) { if (!scsi_device_online(sdev) || !scsi_eh_tur(stu_scmd)) { list_for_each_entry_safe(scmd, next, work_q, eh_entry) { if (scmd->device == sdev && scsi_eh_action(scmd, SUCCESS) == SUCCESS) scsi_eh_finish_cmd(scmd, done_q); } } } else { SCSI_LOG_ERROR_RECOVERY(3, sdev_printk(KERN_INFO, sdev, "%s: START_UNIT failed\n", current->comm)); } } return list_empty(work_q); } /** * scsi_eh_bus_device_reset - send bdr if needed * @shost: scsi host being recovered. * @work_q: &list_head for pending commands. * @done_q: &list_head for processed commands. * * Notes: * Try a bus device reset. Still, look to see whether we have multiple * devices that are jammed or not - if we have multiple devices, it * makes no sense to try bus_device_reset - we really would need to try * a bus_reset instead. 
*/ static int scsi_eh_bus_device_reset(struct Scsi_Host *shost, struct list_head *work_q, struct list_head *done_q) { struct scsi_cmnd *scmd, *bdr_scmd, *next; struct scsi_device *sdev; enum scsi_disposition rtn; shost_for_each_device(sdev, shost) { if (scsi_host_eh_past_deadline(shost)) { SCSI_LOG_ERROR_RECOVERY(3, sdev_printk(KERN_INFO, sdev, "%s: skip BDR, past eh deadline\n", current->comm)); scsi_device_put(sdev); break; } bdr_scmd = NULL; list_for_each_entry(scmd, work_q, eh_entry) if (scmd->device == sdev) { bdr_scmd = scmd; break; } if (!bdr_scmd) continue; SCSI_LOG_ERROR_RECOVERY(3, sdev_printk(KERN_INFO, sdev, "%s: Sending BDR\n", current->comm)); rtn = scsi_try_bus_device_reset(bdr_scmd); if (rtn == SUCCESS || rtn == FAST_IO_FAIL) { if (!scsi_device_online(sdev) || rtn == FAST_IO_FAIL || !scsi_eh_tur(bdr_scmd)) { list_for_each_entry_safe(scmd, next, work_q, eh_entry) { if (scmd->device == sdev && scsi_eh_action(scmd, rtn) != FAILED) scsi_eh_finish_cmd(scmd, done_q); } } } else { SCSI_LOG_ERROR_RECOVERY(3, sdev_printk(KERN_INFO, sdev, "%s: BDR failed\n", current->comm)); } } return list_empty(work_q); } /** * scsi_eh_target_reset - send target reset if needed * @shost: scsi host being recovered. * @work_q: &list_head for pending commands. * @done_q: &list_head for processed commands. * * Notes: * Try a target reset. */ static int scsi_eh_target_reset(struct Scsi_Host *shost, struct list_head *work_q, struct list_head *done_q) { LIST_HEAD(tmp_list); LIST_HEAD(check_list); list_splice_init(work_q, &tmp_list); while (!list_empty(&tmp_list)) { struct scsi_cmnd *next, *scmd; enum scsi_disposition rtn; unsigned int id; if (scsi_host_eh_past_deadline(shost)) { /* push back on work queue for further processing */ list_splice_init(&check_list, work_q); list_splice_init(&tmp_list, work_q); SCSI_LOG_ERROR_RECOVERY(3, shost_printk(KERN_INFO, shost, "%s: Skip target reset, past eh deadline\n", current->comm)); return list_empty(work_q); } scmd = list_entry(tmp_list.next, struct scsi_cmnd, eh_entry); id = scmd_id(scmd); SCSI_LOG_ERROR_RECOVERY(3, shost_printk(KERN_INFO, shost, "%s: Sending target reset to target %d\n", current->comm, id)); rtn = scsi_try_target_reset(scmd); if (rtn != SUCCESS && rtn != FAST_IO_FAIL) SCSI_LOG_ERROR_RECOVERY(3, shost_printk(KERN_INFO, shost, "%s: Target reset failed" " target: %d\n", current->comm, id)); list_for_each_entry_safe(scmd, next, &tmp_list, eh_entry) { if (scmd_id(scmd) != id) continue; if (rtn == SUCCESS) list_move_tail(&scmd->eh_entry, &check_list); else if (rtn == FAST_IO_FAIL) scsi_eh_finish_cmd(scmd, done_q); else /* push back on work queue for further processing */ list_move(&scmd->eh_entry, work_q); } } return scsi_eh_test_devices(&check_list, work_q, done_q, 0); } /** * scsi_eh_bus_reset - send a bus reset * @shost: &scsi host being recovered. * @work_q: &list_head for pending commands. * @done_q: &list_head for processed commands. */ static int scsi_eh_bus_reset(struct Scsi_Host *shost, struct list_head *work_q, struct list_head *done_q) { struct scsi_cmnd *scmd, *chan_scmd, *next; LIST_HEAD(check_list); unsigned int channel; enum scsi_disposition rtn; /* * we really want to loop over the various channels, and do this on * a channel by channel basis. we should also check to see if any * of the failed commands are on soft_reset devices, and if so, skip * the reset. 
*/ for (channel = 0; channel <= shost->max_channel; channel++) { if (scsi_host_eh_past_deadline(shost)) { list_splice_init(&check_list, work_q); SCSI_LOG_ERROR_RECOVERY(3, shost_printk(KERN_INFO, shost, "%s: skip BRST, past eh deadline\n", current->comm)); return list_empty(work_q); } chan_scmd = NULL; list_for_each_entry(scmd, work_q, eh_entry) { if (channel == scmd_channel(scmd)) { chan_scmd = scmd; break; /* * FIXME add back in some support for * soft_reset devices. */ } } if (!chan_scmd) continue; SCSI_LOG_ERROR_RECOVERY(3, shost_printk(KERN_INFO, shost, "%s: Sending BRST chan: %d\n", current->comm, channel)); rtn = scsi_try_bus_reset(chan_scmd); if (rtn == SUCCESS || rtn == FAST_IO_FAIL) { list_for_each_entry_safe(scmd, next, work_q, eh_entry) { if (channel == scmd_channel(scmd)) { if (rtn == FAST_IO_FAIL) scsi_eh_finish_cmd(scmd, done_q); else list_move_tail(&scmd->eh_entry, &check_list); } } } else { SCSI_LOG_ERROR_RECOVERY(3, shost_printk(KERN_INFO, shost, "%s: BRST failed chan: %d\n", current->comm, channel)); } } return scsi_eh_test_devices(&check_list, work_q, done_q, 0); } /** * scsi_eh_host_reset - send a host reset * @shost: host to be reset. * @work_q: &list_head for pending commands. * @done_q: &list_head for processed commands. */ static int scsi_eh_host_reset(struct Scsi_Host *shost, struct list_head *work_q, struct list_head *done_q) { struct scsi_cmnd *scmd, *next; LIST_HEAD(check_list); enum scsi_disposition rtn; if (!list_empty(work_q)) { scmd = list_entry(work_q->next, struct scsi_cmnd, eh_entry); SCSI_LOG_ERROR_RECOVERY(3, shost_printk(KERN_INFO, shost, "%s: Sending HRST\n", current->comm)); rtn = scsi_try_host_reset(scmd); if (rtn == SUCCESS) { list_splice_init(work_q, &check_list); } else if (rtn == FAST_IO_FAIL) { list_for_each_entry_safe(scmd, next, work_q, eh_entry) { scsi_eh_finish_cmd(scmd, done_q); } } else { SCSI_LOG_ERROR_RECOVERY(3, shost_printk(KERN_INFO, shost, "%s: HRST failed\n", current->comm)); } } return scsi_eh_test_devices(&check_list, work_q, done_q, 1); } /** * scsi_eh_offline_sdevs - offline scsi devices that fail to recover * @work_q: &list_head for pending commands. * @done_q: &list_head for processed commands. */ static void scsi_eh_offline_sdevs(struct list_head *work_q, struct list_head *done_q) { struct scsi_cmnd *scmd, *next; struct scsi_device *sdev; list_for_each_entry_safe(scmd, next, work_q, eh_entry) { sdev_printk(KERN_INFO, scmd->device, "Device offlined - " "not ready after error recovery\n"); sdev = scmd->device; mutex_lock(&sdev->state_mutex); scsi_device_set_state(sdev, SDEV_OFFLINE); mutex_unlock(&sdev->state_mutex); scsi_eh_finish_cmd(scmd, done_q); } return; } /** * scsi_noretry_cmd - determine if command should be failed fast * @scmd: SCSI cmd to examine. 
*/ bool scsi_noretry_cmd(struct scsi_cmnd *scmd) { struct request *req = scsi_cmd_to_rq(scmd); switch (host_byte(scmd->result)) { case DID_OK: break; case DID_TIME_OUT: goto check_type; case DID_BUS_BUSY: return !!(req->cmd_flags & REQ_FAILFAST_TRANSPORT); case DID_PARITY: return !!(req->cmd_flags & REQ_FAILFAST_DEV); case DID_ERROR: if (get_status_byte(scmd) == SAM_STAT_RESERVATION_CONFLICT) return false; fallthrough; case DID_SOFT_ERROR: return !!(req->cmd_flags & REQ_FAILFAST_DRIVER); } /* Never retry commands aborted due to a duration limit timeout */ if (scsi_ml_byte(scmd->result) == SCSIML_STAT_DL_TIMEOUT) return true; if (!scsi_status_is_check_condition(scmd->result)) return false; check_type: /* * assume caller has checked sense and determined * the check condition was retryable. */ if (req->cmd_flags & REQ_FAILFAST_DEV || blk_rq_is_passthrough(req)) return true; return false; } /** * scsi_decide_disposition - Disposition a cmd on return from LLD. * @scmd: SCSI cmd to examine. * * Notes: * This is *only* called when we are examining the status after sending * out the actual data command. any commands that are queued for error * recovery (e.g. test_unit_ready) do *not* come through here. * * When this routine returns failed, it means the error handler thread * is woken. In cases where the error code indicates an error that * doesn't require the error handler read (i.e. we don't need to * abort/reset), this function should return SUCCESS. */ enum scsi_disposition scsi_decide_disposition(struct scsi_cmnd *scmd) { enum scsi_disposition rtn; /* * if the device is offline, then we clearly just pass the result back * up to the top level. */ if (!scsi_device_online(scmd->device)) { SCSI_LOG_ERROR_RECOVERY(5, scmd_printk(KERN_INFO, scmd, "%s: device offline - report as SUCCESS\n", __func__)); return SUCCESS; } /* * first check the host byte, to see if there is anything in there * that would indicate what we need to do. */ switch (host_byte(scmd->result)) { case DID_PASSTHROUGH: /* * no matter what, pass this through to the upper layer. * nuke this special code so that it looks like we are saying * did_ok. */ scmd->result &= 0xff00ffff; return SUCCESS; case DID_OK: /* * looks good. drop through, and check the next byte. */ break; case DID_ABORT: if (scmd->eh_eflags & SCSI_EH_ABORT_SCHEDULED) { set_host_byte(scmd, DID_TIME_OUT); return SUCCESS; } fallthrough; case DID_NO_CONNECT: case DID_BAD_TARGET: /* * note - this means that we just report the status back * to the top level driver, not that we actually think * that it indicates SUCCESS. */ return SUCCESS; case DID_SOFT_ERROR: /* * when the low level driver returns did_soft_error, * it is responsible for keeping an internal retry counter * in order to avoid endless loops (db) */ goto maybe_retry; case DID_IMM_RETRY: return NEEDS_RETRY; case DID_REQUEUE: return ADD_TO_MLQUEUE; case DID_TRANSPORT_DISRUPTED: /* * LLD/transport was disrupted during processing of the IO. * The transport class is now blocked/blocking, * and the transport will decide what to do with the IO * based on its timers and recovery capablilities if * there are enough retries. */ goto maybe_retry; case DID_TRANSPORT_FAILFAST: /* * The transport decided to failfast the IO (most likely * the fast io fail tmo fired), so send IO directly upwards. 
*/ return SUCCESS; case DID_TRANSPORT_MARGINAL: /* * caller has decided not to do retries on * abort success, so send IO directly upwards */ return SUCCESS; case DID_ERROR: if (get_status_byte(scmd) == SAM_STAT_RESERVATION_CONFLICT) /* * execute reservation conflict processing code * lower down */ break; fallthrough; case DID_BUS_BUSY: case DID_PARITY: goto maybe_retry; case DID_TIME_OUT: /* * when we scan the bus, we get timeout messages for * these commands if there is no device available. * other hosts report did_no_connect for the same thing. */ if ((scmd->cmnd[0] == TEST_UNIT_READY || scmd->cmnd[0] == INQUIRY)) { return SUCCESS; } else { return FAILED; } case DID_RESET: return SUCCESS; default: return FAILED; } /* * check the status byte to see if this indicates anything special. */ switch (get_status_byte(scmd)) { case SAM_STAT_TASK_SET_FULL: scsi_handle_queue_full(scmd->device); /* * the case of trying to send too many commands to a * tagged queueing device. */ fallthrough; case SAM_STAT_BUSY: /* * device can't talk to us at the moment. Should only * occur (SAM-3) when the task queue is empty, so will cause * the empty queue handling to trigger a stall in the * device. */ return ADD_TO_MLQUEUE; case SAM_STAT_GOOD: if (scmd->cmnd[0] == REPORT_LUNS) scmd->device->sdev_target->expecting_lun_change = 0; scsi_handle_queue_ramp_up(scmd->device); if (scmd->sense_buffer && SCSI_SENSE_VALID(scmd)) /* * If we have sense data, call scsi_check_sense() in * order to set the correct SCSI ML byte (if any). * No point in checking the return value, since the * command has already completed successfully. */ scsi_check_sense(scmd); fallthrough; case SAM_STAT_COMMAND_TERMINATED: return SUCCESS; case SAM_STAT_TASK_ABORTED: goto maybe_retry; case SAM_STAT_CHECK_CONDITION: rtn = scsi_check_sense(scmd); if (rtn == NEEDS_RETRY) goto maybe_retry; /* if rtn == FAILED, we have no sense information; * returning FAILED will wake the error handler thread * to collect the sense and redo the decide * disposition */ return rtn; case SAM_STAT_CONDITION_MET: case SAM_STAT_INTERMEDIATE: case SAM_STAT_INTERMEDIATE_CONDITION_MET: case SAM_STAT_ACA_ACTIVE: /* * who knows? FIXME(eric) */ return SUCCESS; case SAM_STAT_RESERVATION_CONFLICT: sdev_printk(KERN_INFO, scmd->device, "reservation conflict\n"); set_scsi_ml_byte(scmd, SCSIML_STAT_RESV_CONFLICT); return SUCCESS; /* causes immediate i/o error */ } return FAILED; maybe_retry: /* we requeue for retry because the error was retryable, and * the request was not marked fast fail. Note that above, * even if the request is marked fast fail, we still requeue * for queue congestion conditions (QUEUE_FULL or BUSY) */ if (scsi_cmd_retry_allowed(scmd) && !scsi_noretry_cmd(scmd)) { return NEEDS_RETRY; } else { /* * no more retries - report this one back to upper level. */ return SUCCESS; } } static enum rq_end_io_ret eh_lock_door_done(struct request *req, blk_status_t status) { blk_mq_free_request(req); return RQ_END_IO_NONE; } /** * scsi_eh_lock_door - Prevent medium removal for the specified device * @sdev: SCSI device to prevent medium removal * * Locking: * We must be called from process context. * * Notes: * We queue up an asynchronous "ALLOW MEDIUM REMOVAL" request on the * head of the devices request queue, and continue. 
*/ static void scsi_eh_lock_door(struct scsi_device *sdev) { struct scsi_cmnd *scmd; struct request *req; req = scsi_alloc_request(sdev->request_queue, REQ_OP_DRV_IN, 0); if (IS_ERR(req)) return; scmd = blk_mq_rq_to_pdu(req); scmd->cmnd[0] = ALLOW_MEDIUM_REMOVAL; scmd->cmnd[1] = 0; scmd->cmnd[2] = 0; scmd->cmnd[3] = 0; scmd->cmnd[4] = SCSI_REMOVAL_PREVENT; scmd->cmnd[5] = 0; scmd->cmd_len = COMMAND_SIZE(scmd->cmnd[0]); scmd->allowed = 5; req->rq_flags |= RQF_QUIET; req->timeout = 10 * HZ; req->end_io = eh_lock_door_done; blk_execute_rq_nowait(req, true); } /** * scsi_restart_operations - restart io operations to the specified host. * @shost: Host we are restarting. * * Notes: * When we entered the error handler, we blocked all further i/o to * this device. we need to 'reverse' this process. */ static void scsi_restart_operations(struct Scsi_Host *shost) { struct scsi_device *sdev; unsigned long flags; /* * If the door was locked, we need to insert a door lock request * onto the head of the SCSI request queue for the device. There * is no point trying to lock the door of an off-line device. */ shost_for_each_device(sdev, shost) { if (scsi_device_online(sdev) && sdev->was_reset && sdev->locked) { scsi_eh_lock_door(sdev); sdev->was_reset = 0; } } /* * next free up anything directly waiting upon the host. this * will be requests for character device operations, and also for * ioctls to queued block devices. */ SCSI_LOG_ERROR_RECOVERY(3, shost_printk(KERN_INFO, shost, "waking up host to restart\n")); spin_lock_irqsave(shost->host_lock, flags); if (scsi_host_set_state(shost, SHOST_RUNNING)) if (scsi_host_set_state(shost, SHOST_CANCEL)) BUG_ON(scsi_host_set_state(shost, SHOST_DEL)); spin_unlock_irqrestore(shost->host_lock, flags); wake_up(&shost->host_wait); /* * finally we need to re-initiate requests that may be pending. we will * have had everything blocked while error handling is taking place, and * now that error recovery is done, we will need to ensure that these * requests are started. */ scsi_run_host_queues(shost); /* * if eh is active and host_eh_scheduled is pending we need to re-run * recovery. we do this check after scsi_run_host_queues() to allow * everything pent up since the last eh run a chance to make forward * progress before we sync again. Either we'll immediately re-run * recovery or scsi_device_unbusy() will wake us again when these * pending commands complete. */ spin_lock_irqsave(shost->host_lock, flags); if (shost->host_eh_scheduled) if (scsi_host_set_state(shost, SHOST_RECOVERY)) WARN_ON(scsi_host_set_state(shost, SHOST_CANCEL_RECOVERY)); spin_unlock_irqrestore(shost->host_lock, flags); } /** * scsi_eh_ready_devs - check device ready state and recover if not. * @shost: host to be recovered. * @work_q: &list_head for pending commands. * @done_q: &list_head for processed commands. */ void scsi_eh_ready_devs(struct Scsi_Host *shost, struct list_head *work_q, struct list_head *done_q) { if (!scsi_eh_stu(shost, work_q, done_q)) if (!scsi_eh_bus_device_reset(shost, work_q, done_q)) if (!scsi_eh_target_reset(shost, work_q, done_q)) if (!scsi_eh_bus_reset(shost, work_q, done_q)) if (!scsi_eh_host_reset(shost, work_q, done_q)) scsi_eh_offline_sdevs(work_q, done_q); } EXPORT_SYMBOL_GPL(scsi_eh_ready_devs); /** * scsi_eh_flush_done_q - finish processed commands or retry them. * @done_q: list_head of processed commands. 
*/ void scsi_eh_flush_done_q(struct list_head *done_q) { struct scsi_cmnd *scmd, *next; list_for_each_entry_safe(scmd, next, done_q, eh_entry) { struct scsi_device *sdev = scmd->device; list_del_init(&scmd->eh_entry); if (scsi_device_online(sdev) && !scsi_noretry_cmd(scmd) && scsi_cmd_retry_allowed(scmd) && scsi_eh_should_retry_cmd(scmd)) { SCSI_LOG_ERROR_RECOVERY(3, scmd_printk(KERN_INFO, scmd, "%s: flush retry cmd\n", current->comm)); scsi_queue_insert(scmd, SCSI_MLQUEUE_EH_RETRY); blk_mq_kick_requeue_list(sdev->request_queue); } else { /* * If just we got sense for the device (called * scsi_eh_get_sense), scmd->result is already * set, do not set DID_TIME_OUT. */ if (!scmd->result && !(scmd->flags & SCMD_FORCE_EH_SUCCESS)) scmd->result |= (DID_TIME_OUT << 16); SCSI_LOG_ERROR_RECOVERY(3, scmd_printk(KERN_INFO, scmd, "%s: flush finish cmd\n", current->comm)); scsi_finish_command(scmd); } } } EXPORT_SYMBOL(scsi_eh_flush_done_q); /** * scsi_unjam_host - Attempt to fix a host which has a cmd that failed. * @shost: Host to unjam. * * Notes: * When we come in here, we *know* that all commands on the bus have * either completed, failed or timed out. we also know that no further * commands are being sent to the host, so things are relatively quiet * and we have freedom to fiddle with things as we wish. * * This is only the *default* implementation. it is possible for * individual drivers to supply their own version of this function, and * if the maintainer wishes to do this, it is strongly suggested that * this function be taken as a template and modified. this function * was designed to correctly handle problems for about 95% of the * different cases out there, and it should always provide at least a * reasonable amount of error recovery. * * Any command marked 'failed' or 'timeout' must eventually have * scsi_finish_cmd() called for it. we do all of the retry stuff * here, so when we restart the host after we return it should have an * empty queue. */ static void scsi_unjam_host(struct Scsi_Host *shost) { unsigned long flags; LIST_HEAD(eh_work_q); LIST_HEAD(eh_done_q); spin_lock_irqsave(shost->host_lock, flags); list_splice_init(&shost->eh_cmd_q, &eh_work_q); spin_unlock_irqrestore(shost->host_lock, flags); SCSI_LOG_ERROR_RECOVERY(1, scsi_eh_prt_fail_stats(shost, &eh_work_q)); if (!scsi_eh_get_sense(&eh_work_q, &eh_done_q)) scsi_eh_ready_devs(shost, &eh_work_q, &eh_done_q); spin_lock_irqsave(shost->host_lock, flags); if (shost->eh_deadline != -1) shost->last_reset = 0; spin_unlock_irqrestore(shost->host_lock, flags); scsi_eh_flush_done_q(&eh_done_q); } /** * scsi_error_handler - SCSI error handler thread * @data: Host for which we are running. * * Notes: * This is the main error handling loop. This is run as a kernel thread * for every SCSI host and handles all error handling activity. */ int scsi_error_handler(void *data) { struct Scsi_Host *shost = data; /* * We use TASK_INTERRUPTIBLE so that the thread is not * counted against the load average as a running process. * We never actually get interrupted because kthread_run * disables signal delivery for the created thread. */ while (true) { /* * The sequence in kthread_stop() sets the stop flag first * then wakes the process. 
To avoid missed wakeups, the task * should always be in a non running state before the stop * flag is checked */ set_current_state(TASK_INTERRUPTIBLE); if (kthread_should_stop()) break; if ((shost->host_failed == 0 && shost->host_eh_scheduled == 0) || shost->host_failed != scsi_host_busy(shost)) { SCSI_LOG_ERROR_RECOVERY(1, shost_printk(KERN_INFO, shost, "scsi_eh_%d: sleeping\n", shost->host_no)); schedule(); continue; } __set_current_state(TASK_RUNNING); SCSI_LOG_ERROR_RECOVERY(1, shost_printk(KERN_INFO, shost, "scsi_eh_%d: waking up %d/%d/%d\n", shost->host_no, shost->host_eh_scheduled, shost->host_failed, scsi_host_busy(shost))); /* * We have a host that is failing for some reason. Figure out * what we need to do to get it up and online again (if we can). * If we fail, we end up taking the thing offline. */ if (!shost->eh_noresume && scsi_autopm_get_host(shost) != 0) { SCSI_LOG_ERROR_RECOVERY(1, shost_printk(KERN_ERR, shost, "scsi_eh_%d: unable to autoresume\n", shost->host_no)); continue; } if (shost->transportt->eh_strategy_handler) shost->transportt->eh_strategy_handler(shost); else scsi_unjam_host(shost); /* All scmds have been handled */ shost->host_failed = 0; /* * Note - if the above fails completely, the action is to take * individual devices offline and flush the queue of any * outstanding requests that may have been pending. When we * restart, we restart any I/O to any other devices on the bus * which are still online. */ scsi_restart_operations(shost); if (!shost->eh_noresume) scsi_autopm_put_host(shost); } __set_current_state(TASK_RUNNING); SCSI_LOG_ERROR_RECOVERY(1, shost_printk(KERN_INFO, shost, "Error handler scsi_eh_%d exiting\n", shost->host_no)); shost->ehandler = NULL; return 0; } /** * scsi_report_bus_reset() - report bus reset observed * * Utility function used by low-level drivers to report that * they have observed a bus reset on the bus being handled. * * @shost: Host in question * @channel: channel on which reset was observed. * * Returns: Nothing * * Lock status: Host lock must be held. * * Notes: This only needs to be called if the reset is one which * originates from an unknown location. Resets originated * by the mid-level itself don't need to call this, but there * should be no harm. * * The main purpose of this is to make sure that a CHECK_CONDITION * is properly treated. */ void scsi_report_bus_reset(struct Scsi_Host *shost, int channel) { struct scsi_device *sdev; __shost_for_each_device(sdev, shost) { if (channel == sdev_channel(sdev)) __scsi_report_device_reset(sdev, NULL); } } EXPORT_SYMBOL(scsi_report_bus_reset); /** * scsi_report_device_reset() - report device reset observed * * Utility function used by low-level drivers to report that * they have observed a device reset on the device being handled. * * @shost: Host in question * @channel: channel on which reset was observed * @target: target on which reset was observed * * Returns: Nothing * * Lock status: Host lock must be held * * Notes: This only needs to be called if the reset is one which * originates from an unknown location. Resets originated * by the mid-level itself don't need to call this, but there * should be no harm. * * The main purpose of this is to make sure that a CHECK_CONDITION * is properly treated. 
*/ void scsi_report_device_reset(struct Scsi_Host *shost, int channel, int target) { struct scsi_device *sdev; __shost_for_each_device(sdev, shost) { if (channel == sdev_channel(sdev) && target == sdev_id(sdev)) __scsi_report_device_reset(sdev, NULL); } } EXPORT_SYMBOL(scsi_report_device_reset); /** * scsi_ioctl_reset: explicitly reset a host/bus/target/device * @dev: scsi_device to operate on * @arg: reset type (see sg.h) */ int scsi_ioctl_reset(struct scsi_device *dev, int __user *arg) { struct scsi_cmnd *scmd; struct Scsi_Host *shost = dev->host; struct request *rq; unsigned long flags; int error = 0, val; enum scsi_disposition rtn; if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SYS_RAWIO)) return -EACCES; error = get_user(val, arg); if (error) return error; if (scsi_autopm_get_host(shost) < 0) return -EIO; error = -EIO; rq = kzalloc(sizeof(struct request) + sizeof(struct scsi_cmnd) + shost->hostt->cmd_size, GFP_KERNEL); if (!rq) goto out_put_autopm_host; blk_rq_init(NULL, rq); scmd = (struct scsi_cmnd *)(rq + 1); scsi_init_command(dev, scmd); scmd->submitter = SUBMITTED_BY_SCSI_RESET_IOCTL; scmd->flags |= SCMD_LAST; memset(&scmd->sdb, 0, sizeof(scmd->sdb)); scmd->cmd_len = 0; scmd->sc_data_direction = DMA_BIDIRECTIONAL; spin_lock_irqsave(shost->host_lock, flags); shost->tmf_in_progress = 1; spin_unlock_irqrestore(shost->host_lock, flags); switch (val & ~SG_SCSI_RESET_NO_ESCALATE) { case SG_SCSI_RESET_NOTHING: rtn = SUCCESS; break; case SG_SCSI_RESET_DEVICE: rtn = scsi_try_bus_device_reset(scmd); if (rtn == SUCCESS || (val & SG_SCSI_RESET_NO_ESCALATE)) break; fallthrough; case SG_SCSI_RESET_TARGET: rtn = scsi_try_target_reset(scmd); if (rtn == SUCCESS || (val & SG_SCSI_RESET_NO_ESCALATE)) break; fallthrough; case SG_SCSI_RESET_BUS: rtn = scsi_try_bus_reset(scmd); if (rtn == SUCCESS || (val & SG_SCSI_RESET_NO_ESCALATE)) break; fallthrough; case SG_SCSI_RESET_HOST: rtn = scsi_try_host_reset(scmd); if (rtn == SUCCESS) break; fallthrough; default: rtn = FAILED; break; } error = (rtn == SUCCESS) ? 0 : -EIO; spin_lock_irqsave(shost->host_lock, flags); shost->tmf_in_progress = 0; spin_unlock_irqrestore(shost->host_lock, flags); /* * be sure to wake up anyone who was sleeping or had their queue * suspended while we performed the TMF. */ SCSI_LOG_ERROR_RECOVERY(3, shost_printk(KERN_INFO, shost, "waking up host to restart after TMF\n")); wake_up(&shost->host_wait); scsi_run_host_queues(shost); kfree(rq); out_put_autopm_host: scsi_autopm_put_host(shost); return error; } bool scsi_command_normalize_sense(const struct scsi_cmnd *cmd, struct scsi_sense_hdr *sshdr) { return scsi_normalize_sense(cmd->sense_buffer, SCSI_SENSE_BUFFERSIZE, sshdr); } EXPORT_SYMBOL(scsi_command_normalize_sense); /** * scsi_get_sense_info_fld - get information field from sense data (either fixed or descriptor format) * @sense_buffer: byte array of sense data * @sb_len: number of valid bytes in sense_buffer * @info_out: pointer to 64 integer where 8 or 4 byte information * field will be placed if found. * * Return value: * true if information field found, false if not found. 
*/ bool scsi_get_sense_info_fld(const u8 *sense_buffer, int sb_len, u64 *info_out) { const u8 * ucp; if (sb_len < 7) return false; switch (sense_buffer[0] & 0x7f) { case 0x70: case 0x71: if (sense_buffer[0] & 0x80) { *info_out = get_unaligned_be32(&sense_buffer[3]); return true; } return false; case 0x72: case 0x73: ucp = scsi_sense_desc_find(sense_buffer, sb_len, 0 /* info desc */); if (ucp && (0xa == ucp[1])) { *info_out = get_unaligned_be64(&ucp[4]); return true; } return false; default: return false; } } EXPORT_SYMBOL(scsi_get_sense_info_fld);
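/*
 * A minimal stand-alone sketch of the decision scsi_check_sense() makes
 * above: map a (sense key, ASC, ASCQ) triple onto a disposition.  It models
 * only a few representative branches, and the enum and function names are
 * local to the sketch, not kernel API; the real routine also consults device
 * quirks (allow_restart, BLIST_* flags) and command duration limit status.
 */
#include <stdint.h>

enum eh_disposition { EH_SUCCESS, EH_NEEDS_RETRY, EH_FAILED, EH_ADD_TO_MLQUEUE };

static enum eh_disposition model_check_sense(uint8_t key, uint8_t asc, uint8_t ascq)
{
	switch (key) {
	case 0x0:				/* NO SENSE */
	case 0x1:				/* RECOVERED ERROR */
		return EH_SUCCESS;
	case 0x2:				/* NOT READY */
		if (asc == 0x04 && (ascq == 0x01 || ascq == 0x0a))
			return EH_NEEDS_RETRY;	/* becoming ready: retry */
		if (asc == 0x04 && ascq == 0x02)
			return EH_FAILED;	/* stopped: let eh start the unit */
		return EH_SUCCESS;		/* otherwise pass it upwards */
	case 0x3:				/* MEDIUM ERROR */
		if (asc == 0x11 || asc == 0x13 || asc == 0x14)
			return EH_SUCCESS;	/* unrecoverable: report it */
		return EH_NEEDS_RETRY;
	case 0x6:				/* UNIT ATTENTION */
		return EH_NEEDS_RETRY;		/* simplified: only if the UA was expected */
	case 0xb:				/* ABORTED COMMAND */
		return EH_NEEDS_RETRY;
	default:
		return EH_SUCCESS;
	}
}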
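/*
 * scsi_abort_eh_cmnd() and scsi_eh_ready_devs() above escalate through
 * abort, device reset, target reset, bus reset and finally host reset,
 * stopping at the first step that works.  A compact sketch of that pattern
 * with a hypothetical callback table: struct recovery_ops and its members
 * are invented for illustration, they are not the kernel's host template.
 */
#include <stdbool.h>
#include <stddef.h>

struct recovery_ops {
	bool (*abort_cmd)(void *cmd);
	bool (*device_reset)(void *cmd);
	bool (*target_reset)(void *cmd);
	bool (*bus_reset)(void *cmd);
	bool (*host_reset)(void *cmd);
};

static bool recover_command(const struct recovery_ops *ops, void *cmd)
{
	bool (*const steps[])(void *) = {
		ops->abort_cmd, ops->device_reset, ops->target_reset,
		ops->bus_reset, ops->host_reset,
	};

	for (size_t i = 0; i < sizeof(steps) / sizeof(steps[0]); i++)
		if (steps[i] && steps[i](cmd))	/* unimplemented steps are skipped */
			return true;		/* recovered at this scope */

	return false;				/* nothing worked: offline the device */
}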
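/*
 * scsi_get_sense_info_fld() above handles both fixed-format (0x70/0x71) and
 * descriptor-format (0x72/0x73) sense data.  Below is a self-contained
 * userspace version of the fixed-format half only, assuming the usual SPC
 * layout: VALID bit in byte 0, 4-byte big-endian INFORMATION field in bytes
 * 3..6.  The function name is ours, not the kernel's.
 */
#include <stdbool.h>
#include <stdint.h>

static bool fixed_sense_info(const uint8_t *sense, int len, uint64_t *info)
{
	if (len < 7)
		return false;

	switch (sense[0] & 0x7f) {
	case 0x70:				/* current, fixed format */
	case 0x71:				/* deferred, fixed format */
		if (!(sense[0] & 0x80))		/* VALID bit not set */
			return false;
		*info = ((uint64_t)sense[3] << 24) | ((uint32_t)sense[4] << 16) |
			((uint32_t)sense[5] << 8) | sense[6];
		return true;
	default:				/* descriptor format needs a descriptor walk */
		return false;
	}
}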
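/*
 * scsi_ioctl_reset() above backs the SG_SCSI_RESET ioctl.  A minimal
 * userspace caller might look like the sketch below; /dev/sg0 is only an
 * example node, and the call needs CAP_SYS_ADMIN plus CAP_SYS_RAWIO.
 * SG_SCSI_RESET_DEVICE combined with SG_SCSI_RESET_NO_ESCALATE asks for a
 * device reset without letting the midlayer escalate to bigger resets.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <scsi/sg.h>

int main(void)
{
	int val = SG_SCSI_RESET_DEVICE | SG_SCSI_RESET_NO_ESCALATE;
	int fd = open("/dev/sg0", O_RDWR | O_NONBLOCK);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (ioctl(fd, SG_SCSI_RESET, &val) < 0)
		perror("SG_SCSI_RESET");
	close(fd);
	return 0;
}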
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef LINUX_MLD_H
#define LINUX_MLD_H

#include <linux/in6.h>
#include <linux/icmpv6.h>

/* MLDv1 Query/Report/Done */
struct mld_msg {
	struct icmp6hdr		mld_hdr;
	struct in6_addr		mld_mca;
};

#define mld_type		mld_hdr.icmp6_type
#define mld_code		mld_hdr.icmp6_code
#define mld_cksum		mld_hdr.icmp6_cksum
#define mld_maxdelay		mld_hdr.icmp6_maxdelay
#define mld_reserved		mld_hdr.icmp6_dataun.un_data16[1]

/* Multicast Listener Discovery version 2 headers */
/* MLDv2 Report */
struct mld2_grec {
	__u8		grec_type;
	__u8		grec_auxwords;
	__be16		grec_nsrcs;
	struct in6_addr	grec_mca;
	struct in6_addr	grec_src[];
};

struct mld2_report {
	struct icmp6hdr		mld2r_hdr;
	struct mld2_grec	mld2r_grec[];
};

#define mld2r_type		mld2r_hdr.icmp6_type
#define mld2r_resv1		mld2r_hdr.icmp6_code
#define mld2r_cksum		mld2r_hdr.icmp6_cksum
#define mld2r_resv2		mld2r_hdr.icmp6_dataun.un_data16[0]
#define mld2r_ngrec		mld2r_hdr.icmp6_dataun.un_data16[1]

/* MLDv2 Query */
struct mld2_query {
	struct icmp6hdr		mld2q_hdr;
	struct in6_addr		mld2q_mca;
#if defined(__LITTLE_ENDIAN_BITFIELD)
	__u8			mld2q_qrv:3,
				mld2q_suppress:1,
				mld2q_resv2:4;
#elif defined(__BIG_ENDIAN_BITFIELD)
	__u8			mld2q_resv2:4,
				mld2q_suppress:1,
				mld2q_qrv:3;
#else
#error "Please fix <asm/byteorder.h>"
#endif
	__u8			mld2q_qqic;
	__be16			mld2q_nsrcs;
	struct in6_addr		mld2q_srcs[];
};

#define mld2q_type		mld2q_hdr.icmp6_type
#define mld2q_code		mld2q_hdr.icmp6_code
#define mld2q_cksum		mld2q_hdr.icmp6_cksum
#define mld2q_mrc		mld2q_hdr.icmp6_maxdelay
#define mld2q_resv1		mld2q_hdr.icmp6_dataun.un_data16[1]

/* RFC3810, 5.1.3. Maximum Response Code:
 *
 * If Maximum Response Code >= 32768, Maximum Response Code represents a
 * floating-point value as follows:
 *
 *  0 1 2 3 4 5 6 7 8 9 A B C D E F
 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 * |1| exp |          mant         |
 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
 */
#define MLDV2_MRC_EXP(value)	(((value) >> 12) & 0x0007)
#define MLDV2_MRC_MAN(value)	((value) & 0x0fff)

/* RFC3810, 5.1.9. QQIC (Querier's Query Interval Code):
 *
 * If QQIC >= 128, QQIC represents a floating-point value as follows:
 *
 *  0 1 2 3 4 5 6 7
 * +-+-+-+-+-+-+-+-+
 * |1| exp |  mant |
 * +-+-+-+-+-+-+-+-+
 */
#define MLDV2_QQIC_EXP(value)	(((value) >> 4) & 0x07)
#define MLDV2_QQIC_MAN(value)	((value) & 0x0f)

#define MLD_EXP_MIN_LIMIT	32768UL
#define MLDV1_MRD_MAX_COMPAT	(MLD_EXP_MIN_LIMIT - 1)

#define MLD_MAX_QUEUE		8
#define MLD_MAX_SKBS		32

static inline unsigned long mldv2_mrc(const struct mld2_query *mlh2)
{
	/* RFC3810, 5.1.3. Maximum Response Code */
	unsigned long ret, mc_mrc = ntohs(mlh2->mld2q_mrc);

	if (mc_mrc < MLD_EXP_MIN_LIMIT) {
		ret = mc_mrc;
	} else {
		unsigned long mc_man, mc_exp;

		mc_exp = MLDV2_MRC_EXP(mc_mrc);
		mc_man = MLDV2_MRC_MAN(mc_mrc);

		ret = (mc_man | 0x1000) << (mc_exp + 3);
	}

	return ret;
}
#endif
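/*
 * A stand-alone illustration of the Maximum Response Code decoding that
 * mldv2_mrc() above performs: values below 32768 are taken literally
 * (milliseconds), larger values use the RFC 3810 floating-point encoding.
 * The helper name and the spot checks are ours.
 */
#include <stdint.h>

static unsigned long mrc_to_ms(uint16_t mrc)
{
	if (mrc < 32768)
		return mrc;
	/* (mant | 0x1000) << (exp + 3), exp in bits 12..14, mant in bits 0..11 */
	return (unsigned long)((mrc & 0x0fff) | 0x1000) << (((mrc >> 12) & 0x7) + 3);
}

/* e.g. mrc_to_ms(10000) == 10000 and mrc_to_ms(0x8000) == 32768 */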
184 82 82 48 238 286 293 5 288 286 1 287 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/fsync.c * * Copyright (C) 1993 Stephen Tweedie (sct@redhat.com) * from * Copyright (C) 1992 Remy Card (card@masi.ibp.fr) * Laboratoire MASI - Institut Blaise Pascal * Universite Pierre et Marie Curie (Paris VI) * from * linux/fs/minix/truncate.c Copyright (C) 1991, 1992 Linus Torvalds * * ext4fs fsync primitive * * Big-endian to little-endian byte-swapping/bitmaps by * David S. Miller (davem@caip.rutgers.edu), 1995 * * Removed unnecessary code duplication for little endian machines * and excessive __inline__s. * Andi Kleen, 1997 * * Major simplications and cleanup - we only need to do the metadata, because * we can depend on generic_block_fdatasync() to sync the data blocks. */ #include <linux/time.h> #include <linux/fs.h> #include <linux/sched.h> #include <linux/writeback.h> #include <linux/blkdev.h> #include <linux/buffer_head.h> #include "ext4.h" #include "ext4_jbd2.h" #include <trace/events/ext4.h> /* * If we're not journaling and this is a just-created file, we have to * sync our parent directory (if it was freshly created) since * otherwise it will only be written by writeback, leaving a huge * window during which a crash may lose the file. This may apply for * the parent directory's parent as well, and so on recursively, if * they are also freshly created. */ static int ext4_sync_parent(struct inode *inode) { struct dentry *dentry, *next; int ret = 0; if (!ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) return 0; dentry = d_find_any_alias(inode); if (!dentry) return 0; while (ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) { ext4_clear_inode_state(inode, EXT4_STATE_NEWENTRY); next = dget_parent(dentry); dput(dentry); dentry = next; inode = dentry->d_inode; /* * The directory inode may have gone through rmdir by now. But * the inode itself and its blocks are still allocated (we hold * a reference to the inode via its dentry), so it didn't go * through ext4_evict_inode()) and so we are safe to flush * metadata blocks and the inode. */ ret = sync_mapping_buffers(inode->i_mapping); if (ret) break; ret = sync_inode_metadata(inode, 1); if (ret) break; } dput(dentry); return ret; } static int ext4_fsync_nojournal(struct file *file, loff_t start, loff_t end, int datasync, bool *needs_barrier) { struct inode *inode = file->f_inode; int ret; ret = generic_buffers_fsync_noflush(file, start, end, datasync); if (!ret) ret = ext4_sync_parent(inode); if (test_opt(inode->i_sb, BARRIER)) *needs_barrier = true; return ret; } static int ext4_fsync_journal(struct inode *inode, bool datasync, bool *needs_barrier) { struct ext4_inode_info *ei = EXT4_I(inode); journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; tid_t commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid; /* * Fastcommit does not really support fsync on directories or other * special files. Force a full commit. 
*/ if (!S_ISREG(inode->i_mode)) return ext4_force_commit(inode->i_sb); if (journal->j_flags & JBD2_BARRIER && !jbd2_trans_will_send_data_barrier(journal, commit_tid)) *needs_barrier = true; return ext4_fc_commit(journal, commit_tid); } /* * akpm: A new design for ext4_sync_file(). * * This is only called from sys_fsync(), sys_fdatasync() and sys_msync(). * There cannot be a transaction open by this task. * Another task could have dirtied this inode. Its data can be in any * state in the journalling system. * * What we do is just kick off a commit and wait on it. This will snapshot the * inode to disk. */ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) { int ret = 0, err; bool needs_barrier = false; struct inode *inode = file->f_mapping->host; ret = ext4_emergency_state(inode->i_sb); if (unlikely(ret)) return ret; ASSERT(ext4_journal_current_handle() == NULL); trace_ext4_sync_file_enter(file, datasync); if (sb_rdonly(inode->i_sb)) goto out; if (!EXT4_SB(inode->i_sb)->s_journal) { ret = ext4_fsync_nojournal(file, start, end, datasync, &needs_barrier); if (needs_barrier) goto issue_flush; goto out; } ret = file_write_and_wait_range(file, start, end); if (ret) goto out; /* * The caller's filemap_fdatawrite()/wait will sync the data. * Metadata is in the journal, we wait for proper transaction to * commit here. */ ret = ext4_fsync_journal(inode, datasync, &needs_barrier); issue_flush: if (needs_barrier) { err = blkdev_issue_flush(inode->i_sb->s_bdev); if (!ret) ret = err; } out: err = file_check_and_advance_wb_err(file); if (ret == 0) ret = err; trace_ext4_sync_file_exit(inode, ret); return ret; }
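As the comment above notes, ext4_sync_file() is reached from the fsync()/fdatasync() system calls, with datasync selecting the lighter commit (ei->i_datasync_tid vs. ei->i_sync_tid). A minimal userspace illustration of the two entry points follows, assuming an arbitrary file name; it is a usage sketch, not part of the ext4 code.

/*
 * Userspace sketch: fsync(2) commits data plus all inode metadata,
 * fdatasync(2) maps to the datasync=1 case above and may skip
 * non-essential metadata such as timestamps. "testfile" is illustrative.
 */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
	int fd = open("testfile", O_WRONLY | O_CREAT | O_TRUNC, 0644);

	if (fd < 0) {
		perror("open");
		return EXIT_FAILURE;
	}
	if (write(fd, "hello\n", 6) != 6) {
		perror("write");
		return EXIT_FAILURE;
	}
	/* datasync path: data plus only the metadata needed to read it back */
	if (fdatasync(fd) < 0)
		perror("fdatasync");
	/* full sync path: also commits inode metadata such as timestamps */
	if (fsync(fd) < 0)
		perror("fsync");
	close(fd);
	return EXIT_SUCCESS;
}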
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_ATOMIC_H #define _ASM_X86_ATOMIC_H #include <linux/compiler.h> #include <linux/types.h> #include <asm/alternative.h> #include <asm/cmpxchg.h> #include <asm/rmwcc.h> #include <asm/barrier.h> /* * Atomic operations that C can't guarantee us. Useful for * resource counting etc.. */ static __always_inline int arch_atomic_read(const atomic_t *v) { /* * Note for KASAN: we deliberately don't use READ_ONCE_NOCHECK() here, * it's a non-inlined function that increases binary size and stack usage. */ return __READ_ONCE((v)->counter); } static __always_inline void arch_atomic_set(atomic_t *v, int i) { __WRITE_ONCE(v->counter, i); } static __always_inline void arch_atomic_add(int i, atomic_t *v) { asm_inline volatile(LOCK_PREFIX "addl %1, %0" : "+m" (v->counter) : "ir" (i) : "memory"); } static __always_inline void arch_atomic_sub(int i, atomic_t *v) { asm_inline volatile(LOCK_PREFIX "subl %1, %0" : "+m" (v->counter) : "ir" (i) : "memory"); } static __always_inline bool arch_atomic_sub_and_test(int i, atomic_t *v) { return GEN_BINARY_RMWcc(LOCK_PREFIX "subl", v->counter, e, "er", i); } #define arch_atomic_sub_and_test arch_atomic_sub_and_test static __always_inline void arch_atomic_inc(atomic_t *v) { asm_inline volatile(LOCK_PREFIX "incl %0" : "+m" (v->counter) :: "memory"); } #define arch_atomic_inc arch_atomic_inc static __always_inline void arch_atomic_dec(atomic_t *v) { asm_inline volatile(LOCK_PREFIX "decl %0" : "+m" (v->counter) :: "memory"); } #define arch_atomic_dec arch_atomic_dec static __always_inline bool arch_atomic_dec_and_test(atomic_t *v) { return GEN_UNARY_RMWcc(LOCK_PREFIX "decl", v->counter, e); } #define arch_atomic_dec_and_test arch_atomic_dec_and_test static __always_inline bool arch_atomic_inc_and_test(atomic_t *v) { return GEN_UNARY_RMWcc(LOCK_PREFIX "incl", v->counter, e); } #define arch_atomic_inc_and_test arch_atomic_inc_and_test static __always_inline bool arch_atomic_add_negative(int i, atomic_t *v) { return GEN_BINARY_RMWcc(LOCK_PREFIX "addl", v->counter, s, "er", i); } #define arch_atomic_add_negative arch_atomic_add_negative static __always_inline int arch_atomic_add_return(int i, atomic_t *v) { return i + xadd(&v->counter, i); } #define arch_atomic_add_return arch_atomic_add_return #define arch_atomic_sub_return(i, v) arch_atomic_add_return(-(i), v) static __always_inline int arch_atomic_fetch_add(int i, atomic_t *v) { return xadd(&v->counter, i); } #define arch_atomic_fetch_add arch_atomic_fetch_add #define arch_atomic_fetch_sub(i, v) arch_atomic_fetch_add(-(i), v) static __always_inline int arch_atomic_cmpxchg(atomic_t *v, int old, int new) { return arch_cmpxchg(&v->counter, old, new); } #define arch_atomic_cmpxchg arch_atomic_cmpxchg static __always_inline bool arch_atomic_try_cmpxchg(atomic_t *v, int *old, int new) { return arch_try_cmpxchg(&v->counter, old, new); }
#define arch_atomic_try_cmpxchg arch_atomic_try_cmpxchg static __always_inline int arch_atomic_xchg(atomic_t *v, int new) { return arch_xchg(&v->counter, new); } #define arch_atomic_xchg arch_atomic_xchg static __always_inline void arch_atomic_and(int i, atomic_t *v) { asm_inline volatile(LOCK_PREFIX "andl %1, %0" : "+m" (v->counter) : "ir" (i) : "memory"); } static __always_inline int arch_atomic_fetch_and(int i, atomic_t *v) { int val = arch_atomic_read(v); do { } while (!arch_atomic_try_cmpxchg(v, &val, val & i)); return val; } #define arch_atomic_fetch_and arch_atomic_fetch_and static __always_inline void arch_atomic_or(int i, atomic_t *v) { asm_inline volatile(LOCK_PREFIX "orl %1, %0" : "+m" (v->counter) : "ir" (i) : "memory"); } static __always_inline int arch_atomic_fetch_or(int i, atomic_t *v) { int val = arch_atomic_read(v); do { } while (!arch_atomic_try_cmpxchg(v, &val, val | i)); return val; } #define arch_atomic_fetch_or arch_atomic_fetch_or static __always_inline void arch_atomic_xor(int i, atomic_t *v) { asm_inline volatile(LOCK_PREFIX "xorl %1, %0" : "+m" (v->counter) : "ir" (i) : "memory"); } static __always_inline int arch_atomic_fetch_xor(int i, atomic_t *v) { int val = arch_atomic_read(v); do { } while (!arch_atomic_try_cmpxchg(v, &val, val ^ i)); return val; } #define arch_atomic_fetch_xor arch_atomic_fetch_xor #ifdef CONFIG_X86_32 # include <asm/atomic64_32.h> #else # include <asm/atomic64_64.h> #endif #endif /* _ASM_X86_ATOMIC_H */
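arch_atomic_fetch_and(), arch_atomic_fetch_or() and arch_atomic_fetch_xor() above all follow the same pattern: read the counter, then retry arch_atomic_try_cmpxchg() until the update wins the race. Below is a standalone sketch of that fetch-op loop using C11 <stdatomic.h> in place of the kernel's LOCK-prefixed cmpxchg; the function fetch_and and the sample values are illustrative, not kernel API.

/*
 * Standalone sketch (build with -std=c11): read the old value, then retry
 * a compare-and-exchange until the stored value still matches what we read.
 */
#include <stdatomic.h>
#include <stdio.h>

static int fetch_and(atomic_int *v, int i)
{
	int val = atomic_load(v);

	/* compare_exchange_weak refreshes 'val' with the current value on
	 * failure, so the loop naturally retries until it wins the race. */
	while (!atomic_compare_exchange_weak(v, &val, val & i))
		;
	return val;	/* value observed before the AND was applied */
}

int main(void)
{
	atomic_int v = 0xff;
	int old = fetch_and(&v, 0x0f);

	printf("old=%#x new=%#x\n", (unsigned)old, (unsigned)atomic_load(&v));
	/* prints old=0xff new=0xf */
	return 0;
}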
/* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Definitions of the Internet Protocol. * * Version: @(#)in.h 1.0.1 04/21/93 * * Authors: Original taken from the GNU Project <netinet/in.h> file. * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> */ #ifndef _LINUX_IN_H #define _LINUX_IN_H #include <linux/errno.h> #include <uapi/linux/in.h> static inline int proto_ports_offset(int proto) { switch (proto) { case IPPROTO_TCP: case IPPROTO_UDP: case IPPROTO_DCCP: case IPPROTO_ESP: /* SPI */ case IPPROTO_SCTP: case IPPROTO_UDPLITE: return 0; case IPPROTO_AH: /* SPI */ return 4; default: return -EINVAL; } } static inline bool ipv4_is_loopback(__be32 addr) { return (addr & htonl(0xff000000)) == htonl(0x7f000000); } static inline bool ipv4_is_multicast(__be32 addr) { return (addr & htonl(0xf0000000)) == htonl(0xe0000000); } static inline bool ipv4_is_local_multicast(__be32 addr) { return (addr & htonl(0xffffff00)) == htonl(0xe0000000); } static inline bool ipv4_is_lbcast(__be32 addr) { /* limited broadcast */ return addr == htonl(INADDR_BROADCAST); } static inline bool ipv4_is_all_snoopers(__be32 addr) { return addr == htonl(INADDR_ALLSNOOPERS_GROUP); } static inline bool ipv4_is_zeronet(__be32 addr) { return (addr == 0); } /* Special-Use IPv4 Addresses (RFC3330) */ static inline bool ipv4_is_private_10(__be32 addr) { return (addr & htonl(0xff000000)) == htonl(0x0a000000); } static inline bool ipv4_is_private_172(__be32 addr) { return (addr & htonl(0xfff00000)) == htonl(0xac100000); } static inline bool ipv4_is_private_192(__be32 addr) { return (addr & htonl(0xffff0000)) == htonl(0xc0a80000); } static inline bool ipv4_is_linklocal_169(__be32 addr) { return (addr & htonl(0xffff0000)) == htonl(0xa9fe0000); } static inline bool ipv4_is_anycast_6to4(__be32 addr) { return (addr & htonl(0xffffff00)) == htonl(0xc0586300); } static inline bool ipv4_is_test_192(__be32 addr) { return (addr & htonl(0xffffff00)) == htonl(0xc0000200); } static inline bool ipv4_is_test_198(__be32 addr) { return (addr & htonl(0xfffe0000)) == htonl(0xc6120000); } #endif /* _LINUX_IN_H */
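The ipv4_is_*() helpers above all test an address prefix by masking in network byte order, so the comparison works directly on the __be32 value found in packet headers. Here is a small userspace sketch of the same check for 192.168.0.0/16; the helper name and sample addresses are illustrative.

/*
 * Userspace sketch of the masked-prefix test used by the ipv4_is_*()
 * helpers: mask and expected prefix are both expressed with htonl() so the
 * comparison is done on the network-byte-order address.
 */
#include <arpa/inet.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool is_private_192(uint32_t addr)	/* addr in network byte order */
{
	/* 192.168.0.0/16, same check as ipv4_is_private_192() */
	return (addr & htonl(0xffff0000)) == htonl(0xc0a80000);
}

int main(void)
{
	struct in_addr a;

	inet_pton(AF_INET, "192.168.1.42", &a);
	printf("192.168.1.42 private? %d\n", is_private_192(a.s_addr));	/* 1 */
	inet_pton(AF_INET, "8.8.8.8", &a);
	printf("8.8.8.8 private? %d\n", is_private_192(a.s_addr));	/* 0 */
	return 0;
}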
// SPDX-License-Identifier: GPL-2.0-or-later /* * RAW sockets for IPv6 * Linux INET6 implementation * * Authors: * Pedro Roque <roque@di.fc.ul.pt> * * Adapted from linux/net/ipv4/raw.c * * Fixes: * Hideaki YOSHIFUJI : sin6_scope_id support * YOSHIFUJI,H.@USAGI : raw checksum (RFC2292(bis) compliance) * Kazunori MIYAZAWA @USAGI: change process style to use ip6_append_data */ #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/slab.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/in6.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/icmpv6.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv6.h> #include <linux/skbuff.h> #include <linux/compat.h> #include <linux/uaccess.h> #include <asm/ioctls.h> #include <net/net_namespace.h> #include <net/ip.h> #include <net/sock.h> #include <net/snmp.h> #include <net/ipv6.h> #include <net/ndisc.h> #include <net/protocol.h> #include <net/ip6_route.h> #include <net/ip6_checksum.h> #include <net/addrconf.h> #include
<net/transp_v6.h> #include <net/udp.h> #include <net/inet_common.h> #include <net/tcp_states.h> #if IS_ENABLED(CONFIG_IPV6_MIP6) #include <net/mip6.h> #endif #include <linux/mroute6.h> #include <net/raw.h> #include <net/rawv6.h> #include <net/xfrm.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/export.h> #define ICMPV6_HDRLEN 4 /* ICMPv6 header, RFC 4443 Section 2.1 */ struct raw_hashinfo raw_v6_hashinfo; EXPORT_SYMBOL_GPL(raw_v6_hashinfo); bool raw_v6_match(struct net *net, const struct sock *sk, unsigned short num, const struct in6_addr *loc_addr, const struct in6_addr *rmt_addr, int dif, int sdif) { if (inet_sk(sk)->inet_num != num || !net_eq(sock_net(sk), net) || (!ipv6_addr_any(&sk->sk_v6_daddr) && !ipv6_addr_equal(&sk->sk_v6_daddr, rmt_addr)) || !raw_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif)) return false; if (ipv6_addr_any(&sk->sk_v6_rcv_saddr) || ipv6_addr_equal(&sk->sk_v6_rcv_saddr, loc_addr) || (ipv6_addr_is_multicast(loc_addr) && inet6_mc_check(sk, loc_addr, rmt_addr))) return true; return false; } EXPORT_SYMBOL_GPL(raw_v6_match); /* * 0 - deliver * 1 - block */ static int icmpv6_filter(const struct sock *sk, const struct sk_buff *skb) { struct icmp6hdr _hdr; const struct icmp6hdr *hdr; /* We require only the four bytes of the ICMPv6 header, not any * additional bytes of message body in "struct icmp6hdr". */ hdr = skb_header_pointer(skb, skb_transport_offset(skb), ICMPV6_HDRLEN, &_hdr); if (hdr) { const __u32 *data = &raw6_sk(sk)->filter.data[0]; unsigned int type = hdr->icmp6_type; return (data[type >> 5] & (1U << (type & 31))) != 0; } return 1; } #if IS_ENABLED(CONFIG_IPV6_MIP6) typedef int mh_filter_t(struct sock *sock, struct sk_buff *skb); static mh_filter_t __rcu *mh_filter __read_mostly; int rawv6_mh_filter_register(mh_filter_t filter) { rcu_assign_pointer(mh_filter, filter); return 0; } EXPORT_SYMBOL(rawv6_mh_filter_register); int rawv6_mh_filter_unregister(mh_filter_t filter) { RCU_INIT_POINTER(mh_filter, NULL); synchronize_rcu(); return 0; } EXPORT_SYMBOL(rawv6_mh_filter_unregister); #endif /* * demultiplex raw sockets. * (should consider queueing the skb in the sock receive_queue * without calling rawv6.c) * * Caller owns SKB so we must make clones. */ static bool ipv6_raw_deliver(struct sk_buff *skb, int nexthdr) { struct net *net = dev_net(skb->dev); const struct in6_addr *saddr; const struct in6_addr *daddr; struct hlist_head *hlist; struct sock *sk; bool delivered = false; __u8 hash; saddr = &ipv6_hdr(skb)->saddr; daddr = saddr + 1; hash = raw_hashfunc(net, nexthdr); hlist = &raw_v6_hashinfo.ht[hash]; rcu_read_lock(); sk_for_each_rcu(sk, hlist) { int filtered; if (!raw_v6_match(net, sk, nexthdr, daddr, saddr, inet6_iif(skb), inet6_sdif(skb))) continue; if (atomic_read(&sk->sk_rmem_alloc) >= READ_ONCE(sk->sk_rcvbuf)) { sk_drops_inc(sk); continue; } delivered = true; switch (nexthdr) { case IPPROTO_ICMPV6: filtered = icmpv6_filter(sk, skb); break; #if IS_ENABLED(CONFIG_IPV6_MIP6) case IPPROTO_MH: { /* XXX: To validate MH only once for each packet, * this is placed here. It should be after checking * xfrm policy, however it doesn't. The checking xfrm * policy is placed in rawv6_rcv() because it is * required for each socket. */ mh_filter_t *filter; filter = rcu_dereference(mh_filter); filtered = filter ? (*filter)(sk, skb) : 0; break; } #endif default: filtered = 0; break; } if (filtered < 0) break; if (filtered == 0) { struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC); /* Not releasing hash table! 
*/ if (clone) rawv6_rcv(sk, clone); } } rcu_read_unlock(); return delivered; } bool raw6_local_deliver(struct sk_buff *skb, int nexthdr) { return ipv6_raw_deliver(skb, nexthdr); } /* This cleans up af_inet6 a bit. -DaveM */ static int rawv6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) { struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); struct sockaddr_in6 *addr = (struct sockaddr_in6 *) uaddr; __be32 v4addr = 0; int addr_type; int err; if (addr_len < SIN6_LEN_RFC2133) return -EINVAL; if (addr->sin6_family != AF_INET6) return -EINVAL; addr_type = ipv6_addr_type(&addr->sin6_addr); /* Raw sockets are IPv6 only */ if (addr_type == IPV6_ADDR_MAPPED) return -EADDRNOTAVAIL; lock_sock(sk); err = -EINVAL; if (sk->sk_state != TCP_CLOSE) goto out; rcu_read_lock(); /* Check if the address belongs to the host. */ if (addr_type != IPV6_ADDR_ANY) { struct net_device *dev = NULL; if (__ipv6_addr_needs_scope_id(addr_type)) { if (addr_len >= sizeof(struct sockaddr_in6) && addr->sin6_scope_id) { /* Override any existing binding, if another * one is supplied by user. */ sk->sk_bound_dev_if = addr->sin6_scope_id; } /* Binding to link-local address requires an interface */ if (!sk->sk_bound_dev_if) goto out_unlock; } if (sk->sk_bound_dev_if) { err = -ENODEV; dev = dev_get_by_index_rcu(sock_net(sk), sk->sk_bound_dev_if); if (!dev) goto out_unlock; } /* ipv4 addr of the socket is invalid. Only the * unspecified and mapped address have a v4 equivalent. */ v4addr = LOOPBACK4_IPV6; if (!(addr_type & IPV6_ADDR_MULTICAST) && !ipv6_can_nonlocal_bind(sock_net(sk), inet)) { err = -EADDRNOTAVAIL; if (!ipv6_chk_addr(sock_net(sk), &addr->sin6_addr, dev, 0)) { goto out_unlock; } } } inet->inet_rcv_saddr = inet->inet_saddr = v4addr; sk->sk_v6_rcv_saddr = addr->sin6_addr; if (!(addr_type & IPV6_ADDR_MULTICAST)) np->saddr = addr->sin6_addr; err = 0; out_unlock: rcu_read_unlock(); out: release_sock(sk); return err; } static void rawv6_err(struct sock *sk, struct sk_buff *skb, u8 type, u8 code, int offset, __be32 info) { bool recverr = inet6_test_bit(RECVERR6, sk); struct ipv6_pinfo *np = inet6_sk(sk); int err; int harderr; /* Report error on raw socket, if: 1. User requested recverr. 2. Socket is connected (otherwise the error indication is useless without recverr and error is hard. 
*/ if (!recverr && sk->sk_state != TCP_ESTABLISHED) return; harderr = icmpv6_err_convert(type, code, &err); if (type == ICMPV6_PKT_TOOBIG) { ip6_sk_update_pmtu(skb, sk, info); harderr = (READ_ONCE(np->pmtudisc) == IPV6_PMTUDISC_DO); } if (type == NDISC_REDIRECT) { ip6_sk_redirect(skb, sk); return; } if (recverr) { u8 *payload = skb->data; if (!inet_test_bit(HDRINCL, sk)) payload += offset; ipv6_icmp_error(sk, skb, err, 0, ntohl(info), payload); } if (recverr || harderr) { sk->sk_err = err; sk_error_report(sk); } } void raw6_icmp_error(struct sk_buff *skb, int nexthdr, u8 type, u8 code, int inner_offset, __be32 info) { struct net *net = dev_net(skb->dev); struct hlist_head *hlist; struct sock *sk; int hash; hash = raw_hashfunc(net, nexthdr); hlist = &raw_v6_hashinfo.ht[hash]; rcu_read_lock(); sk_for_each_rcu(sk, hlist) { /* Note: ipv6_hdr(skb) != skb->data */ const struct ipv6hdr *ip6h = (const struct ipv6hdr *)skb->data; if (!raw_v6_match(net, sk, nexthdr, &ip6h->saddr, &ip6h->daddr, inet6_iif(skb), inet6_iif(skb))) continue; rawv6_err(sk, skb, type, code, inner_offset, info); } rcu_read_unlock(); } static inline int rawv6_rcv_skb(struct sock *sk, struct sk_buff *skb) { enum skb_drop_reason reason; if ((raw6_sk(sk)->checksum || rcu_access_pointer(sk->sk_filter)) && skb_checksum_complete(skb)) { sk_drops_inc(sk); sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_SKB_CSUM); return NET_RX_DROP; } /* Charge it to the socket. */ skb_dst_drop(skb); if (sock_queue_rcv_skb_reason(sk, skb, &reason) < 0) { sk_skb_reason_drop(sk, skb, reason); return NET_RX_DROP; } return 0; } /* * This is next to useless... * if we demultiplex in network layer we don't need the extra call * just to queue the skb... * maybe we could have the network decide upon a hint if it * should call raw_rcv for demultiplexing */ int rawv6_rcv(struct sock *sk, struct sk_buff *skb) { struct inet_sock *inet = inet_sk(sk); struct raw6_sock *rp = raw6_sk(sk); if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) { sk_drops_inc(sk); sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_XFRM_POLICY); return NET_RX_DROP; } nf_reset_ct(skb); if (!rp->checksum) skb->ip_summed = CHECKSUM_UNNECESSARY; if (skb->ip_summed == CHECKSUM_COMPLETE) { skb_postpull_rcsum(skb, skb_network_header(skb), skb_network_header_len(skb)); if (!csum_ipv6_magic(&ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, skb->len, inet->inet_num, skb->csum)) skb->ip_summed = CHECKSUM_UNNECESSARY; } if (!skb_csum_unnecessary(skb)) skb->csum = ~csum_unfold(csum_ipv6_magic(&ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, skb->len, inet->inet_num, 0)); if (inet_test_bit(HDRINCL, sk)) { if (skb_checksum_complete(skb)) { sk_drops_inc(sk); sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_SKB_CSUM); return NET_RX_DROP; } } rawv6_rcv_skb(sk, skb); return 0; } /* * This should be easy, if there is something there * we return it, otherwise we block. 
*/ static int rawv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len) { struct ipv6_pinfo *np = inet6_sk(sk); DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name); struct sk_buff *skb; size_t copied; int err; if (flags & MSG_OOB) return -EOPNOTSUPP; if (flags & MSG_ERRQUEUE) return ipv6_recv_error(sk, msg, len, addr_len); if (np->rxopt.bits.rxpmtu && READ_ONCE(np->rxpmtu)) return ipv6_recv_rxpmtu(sk, msg, len, addr_len); skb = skb_recv_datagram(sk, flags, &err); if (!skb) goto out; copied = skb->len; if (copied > len) { copied = len; msg->msg_flags |= MSG_TRUNC; } if (skb_csum_unnecessary(skb)) { err = skb_copy_datagram_msg(skb, 0, msg, copied); } else if (msg->msg_flags&MSG_TRUNC) { if (__skb_checksum_complete(skb)) goto csum_copy_err; err = skb_copy_datagram_msg(skb, 0, msg, copied); } else { err = skb_copy_and_csum_datagram_msg(skb, 0, msg); if (err == -EINVAL) goto csum_copy_err; } if (err) goto out_free; /* Copy the address. */ if (sin6) { sin6->sin6_family = AF_INET6; sin6->sin6_port = 0; sin6->sin6_addr = ipv6_hdr(skb)->saddr; sin6->sin6_flowinfo = 0; sin6->sin6_scope_id = ipv6_iface_scope_id(&sin6->sin6_addr, inet6_iif(skb)); *addr_len = sizeof(*sin6); } sock_recv_cmsgs(msg, sk, skb); if (np->rxopt.all) ip6_datagram_recv_ctl(sk, msg, skb); err = copied; if (flags & MSG_TRUNC) err = skb->len; out_free: skb_free_datagram(sk, skb); out: return err; csum_copy_err: skb_kill_datagram(sk, skb, flags); /* Error for blocking case is chosen to masquerade as some normal condition. */ err = (flags&MSG_DONTWAIT) ? -EAGAIN : -EHOSTUNREACH; goto out; } static int rawv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6, struct raw6_sock *rp) { struct ipv6_txoptions *opt; struct sk_buff *skb; int err = 0; int offset; int len; int total_len; __wsum tmp_csum; __sum16 csum; if (!rp->checksum) goto send; skb = skb_peek(&sk->sk_write_queue); if (!skb) goto out; offset = rp->offset; total_len = inet_sk(sk)->cork.base.length; opt = inet6_sk(sk)->cork.opt; total_len -= opt ? opt->opt_flen : 0; if (offset >= total_len - 1) { err = -EINVAL; ip6_flush_pending_frames(sk); goto out; } /* should be check HW csum miyazawa */ if (skb_queue_len(&sk->sk_write_queue) == 1) { /* * Only one fragment on the socket. 
*/ tmp_csum = skb->csum; } else { struct sk_buff *csum_skb = NULL; tmp_csum = 0; skb_queue_walk(&sk->sk_write_queue, skb) { tmp_csum = csum_add(tmp_csum, skb->csum); if (csum_skb) continue; len = skb->len - skb_transport_offset(skb); if (offset >= len) { offset -= len; continue; } csum_skb = skb; } skb = csum_skb; } offset += skb_transport_offset(skb); err = skb_copy_bits(skb, offset, &csum, 2); if (err < 0) { ip6_flush_pending_frames(sk); goto out; } /* in case cksum was not initialized */ if (unlikely(csum)) tmp_csum = csum_sub(tmp_csum, csum_unfold(csum)); csum = csum_ipv6_magic(&fl6->saddr, &fl6->daddr, total_len, fl6->flowi6_proto, tmp_csum); if (csum == 0 && fl6->flowi6_proto == IPPROTO_UDP) csum = CSUM_MANGLED_0; BUG_ON(skb_store_bits(skb, offset, &csum, 2)); send: err = ip6_push_pending_frames(sk); out: return err; } static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length, struct flowi6 *fl6, struct dst_entry **dstp, unsigned int flags, const struct sockcm_cookie *sockc) { struct net *net = sock_net(sk); struct ipv6hdr *iph; struct sk_buff *skb; int err; struct rt6_info *rt = dst_rt6_info(*dstp); int hlen = LL_RESERVED_SPACE(rt->dst.dev); int tlen = rt->dst.dev->needed_tailroom; if (length > rt->dst.dev->mtu) { ipv6_local_error(sk, EMSGSIZE, fl6, rt->dst.dev->mtu); return -EMSGSIZE; } if (length < sizeof(struct ipv6hdr)) return -EINVAL; if (flags&MSG_PROBE) goto out; skb = sock_alloc_send_skb(sk, length + hlen + tlen + 15, flags & MSG_DONTWAIT, &err); if (!skb) goto error; skb_reserve(skb, hlen); skb->protocol = htons(ETH_P_IPV6); skb->priority = sockc->priority; skb->mark = sockc->mark; skb_set_delivery_type_by_clockid(skb, sockc->transmit_time, sk->sk_clockid); skb_put(skb, length); skb_reset_network_header(skb); iph = ipv6_hdr(skb); skb->ip_summed = CHECKSUM_NONE; skb_setup_tx_timestamp(skb, sockc); if (flags & MSG_CONFIRM) skb_set_dst_pending_confirm(skb, 1); skb->transport_header = skb->network_header; err = memcpy_from_msg(iph, msg, length); if (err) { err = -EFAULT; kfree_skb(skb); goto error; } skb_dst_set(skb, &rt->dst); *dstp = NULL; /* if egress device is enslaved to an L3 master device pass the * skb to its handler for processing */ skb = l3mdev_ip6_out(sk, skb); if (unlikely(!skb)) return 0; /* Acquire rcu_read_lock() in case we need to use rt->rt6i_idev * in the error path. Since skb has been freed, the dst could * have been queued for deletion. 
*/ rcu_read_lock(); IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS); err = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, skb, NULL, rt->dst.dev, dst_output); if (err > 0) err = net_xmit_errno(err); if (err) { IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS); rcu_read_unlock(); goto error_check; } rcu_read_unlock(); out: return 0; error: IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS); error_check: if (err == -ENOBUFS && !inet6_test_bit(RECVERR6, sk)) err = 0; return err; } struct raw6_frag_vec { struct msghdr *msg; int hlen; char c[4]; }; static int rawv6_probe_proto_opt(struct raw6_frag_vec *rfv, struct flowi6 *fl6) { int err = 0; switch (fl6->flowi6_proto) { case IPPROTO_ICMPV6: rfv->hlen = 2; err = memcpy_from_msg(rfv->c, rfv->msg, rfv->hlen); if (!err) { fl6->fl6_icmp_type = rfv->c[0]; fl6->fl6_icmp_code = rfv->c[1]; } break; case IPPROTO_MH: rfv->hlen = 4; err = memcpy_from_msg(rfv->c, rfv->msg, rfv->hlen); if (!err) fl6->fl6_mh_type = rfv->c[2]; } return err; } static int raw6_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb) { struct raw6_frag_vec *rfv = from; if (offset < rfv->hlen) { int copy = min(rfv->hlen - offset, len); if (skb->ip_summed == CHECKSUM_PARTIAL) memcpy(to, rfv->c + offset, copy); else skb->csum = csum_block_add( skb->csum, csum_partial_copy_nocheck(rfv->c + offset, to, copy), odd); odd = 0; offset += copy; to += copy; len -= copy; if (!len) return 0; } offset -= rfv->hlen; return ip_generic_getfrag(rfv->msg, to, offset, len, odd, skb); } static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { struct ipv6_txoptions *opt_to_free = NULL; struct ipv6_txoptions opt_space; DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name); struct in6_addr *daddr, *final_p, final; struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); struct raw6_sock *rp = raw6_sk(sk); struct ipv6_txoptions *opt = NULL; struct ip6_flowlabel *flowlabel = NULL; struct dst_entry *dst = NULL; struct raw6_frag_vec rfv; struct flowi6 fl6; struct ipcm6_cookie ipc6; int addr_len = msg->msg_namelen; int hdrincl; u16 proto; int err; /* Rough check on arithmetic overflow, better check is made in ip6_append_data(). */ if (len > INT_MAX) return -EMSGSIZE; /* Mirror BSD error message compatibility */ if (msg->msg_flags & MSG_OOB) return -EOPNOTSUPP; hdrincl = inet_test_bit(HDRINCL, sk); ipcm6_init_sk(&ipc6, sk); /* * Get and verify the address. */ memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_mark = ipc6.sockc.mark; fl6.flowi6_uid = sk_uid(sk); if (sin6) { if (addr_len < SIN6_LEN_RFC2133) return -EINVAL; if (sin6->sin6_family && sin6->sin6_family != AF_INET6) return -EAFNOSUPPORT; /* port is the proto value [0..255] carried in nexthdr */ proto = ntohs(sin6->sin6_port); if (!proto) proto = inet->inet_num; else if (proto != inet->inet_num && inet->inet_num != IPPROTO_RAW) return -EINVAL; if (proto > 255) return -EINVAL; daddr = &sin6->sin6_addr; if (inet6_test_bit(SNDFLOW, sk)) { fl6.flowlabel = sin6->sin6_flowinfo&IPV6_FLOWINFO_MASK; if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) { flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); if (IS_ERR(flowlabel)) return -EINVAL; } } /* * Otherwise it will be difficult to maintain * sk->sk_dst_cache. 
*/ if (sk->sk_state == TCP_ESTABLISHED && ipv6_addr_equal(daddr, &sk->sk_v6_daddr)) daddr = &sk->sk_v6_daddr; if (addr_len >= sizeof(struct sockaddr_in6) && sin6->sin6_scope_id && __ipv6_addr_needs_scope_id(__ipv6_addr_type(daddr))) fl6.flowi6_oif = sin6->sin6_scope_id; } else { if (sk->sk_state != TCP_ESTABLISHED) return -EDESTADDRREQ; proto = inet->inet_num; daddr = &sk->sk_v6_daddr; fl6.flowlabel = np->flow_label; } if (fl6.flowi6_oif == 0) fl6.flowi6_oif = sk->sk_bound_dev_if; if (msg->msg_controllen) { opt = &opt_space; memset(opt, 0, sizeof(struct ipv6_txoptions)); opt->tot_len = sizeof(struct ipv6_txoptions); ipc6.opt = opt; err = ip6_datagram_send_ctl(sock_net(sk), sk, msg, &fl6, &ipc6); if (err < 0) { fl6_sock_release(flowlabel); return err; } if ((fl6.flowlabel&IPV6_FLOWLABEL_MASK) && !flowlabel) { flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); if (IS_ERR(flowlabel)) return -EINVAL; } if (!(opt->opt_nflen|opt->opt_flen)) opt = NULL; } if (!opt) { opt = txopt_get(np); opt_to_free = opt; } if (flowlabel) opt = fl6_merge_options(&opt_space, flowlabel, opt); opt = ipv6_fixup_options(&opt_space, opt); fl6.flowi6_proto = proto; fl6.flowi6_mark = ipc6.sockc.mark; if (!hdrincl) { rfv.msg = msg; rfv.hlen = 0; err = rawv6_probe_proto_opt(&rfv, &fl6); if (err) goto out; } if (!ipv6_addr_any(daddr)) fl6.daddr = *daddr; else fl6.daddr.s6_addr[15] = 0x1; /* :: means loopback (BSD'ism) */ if (ipv6_addr_any(&fl6.saddr) && !ipv6_addr_any(&np->saddr)) fl6.saddr = np->saddr; final_p = fl6_update_dst(&fl6, opt, &final); if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr)) fl6.flowi6_oif = READ_ONCE(np->mcast_oif); else if (!fl6.flowi6_oif) fl6.flowi6_oif = READ_ONCE(np->ucast_oif); security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6)); if (hdrincl) fl6.flowi6_flags |= FLOWI_FLAG_KNOWN_NH; fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel); dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p); if (IS_ERR(dst)) { err = PTR_ERR(dst); goto out; } if (ipc6.hlimit < 0) ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); if (msg->msg_flags&MSG_CONFIRM) goto do_confirm; back_from_confirm: if (hdrincl) err = rawv6_send_hdrinc(sk, msg, len, &fl6, &dst, msg->msg_flags, &ipc6.sockc); else { ipc6.opt = opt; lock_sock(sk); err = ip6_append_data(sk, raw6_getfrag, &rfv, len, 0, &ipc6, &fl6, dst_rt6_info(dst), msg->msg_flags); if (err) ip6_flush_pending_frames(sk); else if (!(msg->msg_flags & MSG_MORE)) err = rawv6_push_pending_frames(sk, &fl6, rp); release_sock(sk); } done: dst_release(dst); out: fl6_sock_release(flowlabel); txopt_put(opt_to_free); return err < 0 ? 
err : len; do_confirm: if (msg->msg_flags & MSG_PROBE) dst_confirm_neigh(dst, &fl6.daddr); if (!(msg->msg_flags & MSG_PROBE) || len) goto back_from_confirm; err = 0; goto done; } static int rawv6_seticmpfilter(struct sock *sk, int optname, sockptr_t optval, int optlen) { switch (optname) { case ICMPV6_FILTER: if (optlen > sizeof(struct icmp6_filter)) optlen = sizeof(struct icmp6_filter); if (copy_from_sockptr(&raw6_sk(sk)->filter, optval, optlen)) return -EFAULT; return 0; default: return -ENOPROTOOPT; } return 0; } static int rawv6_geticmpfilter(struct sock *sk, int optname, char __user *optval, int __user *optlen) { int len; switch (optname) { case ICMPV6_FILTER: if (get_user(len, optlen)) return -EFAULT; if (len < 0) return -EINVAL; if (len > sizeof(struct icmp6_filter)) len = sizeof(struct icmp6_filter); if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, &raw6_sk(sk)->filter, len)) return -EFAULT; return 0; default: return -ENOPROTOOPT; } return 0; } static int do_rawv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { struct raw6_sock *rp = raw6_sk(sk); int val; if (optlen < sizeof(val)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; switch (optname) { case IPV6_HDRINCL: if (sk->sk_type != SOCK_RAW) return -EINVAL; inet_assign_bit(HDRINCL, sk, val); return 0; case IPV6_CHECKSUM: if (inet_sk(sk)->inet_num == IPPROTO_ICMPV6 && level == IPPROTO_IPV6) { /* * RFC3542 tells that IPV6_CHECKSUM socket * option in the IPPROTO_IPV6 level is not * allowed on ICMPv6 sockets. * If you want to set it, use IPPROTO_RAW * level IPV6_CHECKSUM socket option * (Linux extension). */ return -EINVAL; } /* You may get strange result with a positive odd offset; RFC2292bis agrees with me. */ if (val > 0 && (val&1)) return -EINVAL; if (val < 0) { rp->checksum = 0; } else { rp->checksum = 1; rp->offset = val; } return 0; default: return -ENOPROTOOPT; } } static int rawv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { switch (level) { case SOL_RAW: break; case SOL_ICMPV6: if (inet_sk(sk)->inet_num != IPPROTO_ICMPV6) return -EOPNOTSUPP; return rawv6_seticmpfilter(sk, optname, optval, optlen); case SOL_IPV6: if (optname == IPV6_CHECKSUM || optname == IPV6_HDRINCL) break; fallthrough; default: return ipv6_setsockopt(sk, level, optname, optval, optlen); } return do_rawv6_setsockopt(sk, level, optname, optval, optlen); } static int do_rawv6_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { struct raw6_sock *rp = raw6_sk(sk); int val, len; if (get_user(len, optlen)) return -EFAULT; switch (optname) { case IPV6_HDRINCL: val = inet_test_bit(HDRINCL, sk); break; case IPV6_CHECKSUM: /* * We allow getsockopt() for IPPROTO_IPV6-level * IPV6_CHECKSUM socket option on ICMPv6 sockets * since RFC3542 is silent about it. 
*/ if (rp->checksum == 0) val = -1; else val = rp->offset; break; default: return -ENOPROTOOPT; } len = min_t(unsigned int, sizeof(int), len); if (put_user(len, optlen)) return -EFAULT; if (copy_to_user(optval, &val, len)) return -EFAULT; return 0; } static int rawv6_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { switch (level) { case SOL_RAW: break; case SOL_ICMPV6: if (inet_sk(sk)->inet_num != IPPROTO_ICMPV6) return -EOPNOTSUPP; return rawv6_geticmpfilter(sk, optname, optval, optlen); case SOL_IPV6: if (optname == IPV6_CHECKSUM || optname == IPV6_HDRINCL) break; fallthrough; default: return ipv6_getsockopt(sk, level, optname, optval, optlen); } return do_rawv6_getsockopt(sk, level, optname, optval, optlen); } static int rawv6_ioctl(struct sock *sk, int cmd, int *karg) { switch (cmd) { case SIOCOUTQ: { *karg = sk_wmem_alloc_get(sk); return 0; } case SIOCINQ: { struct sk_buff *skb; spin_lock_bh(&sk->sk_receive_queue.lock); skb = skb_peek(&sk->sk_receive_queue); if (skb) *karg = skb->len; else *karg = 0; spin_unlock_bh(&sk->sk_receive_queue.lock); return 0; } default: #ifdef CONFIG_IPV6_MROUTE return ip6mr_ioctl(sk, cmd, karg); #else return -ENOIOCTLCMD; #endif } } #ifdef CONFIG_COMPAT static int compat_rawv6_ioctl(struct sock *sk, unsigned int cmd, unsigned long arg) { switch (cmd) { case SIOCOUTQ: case SIOCINQ: return -ENOIOCTLCMD; default: #ifdef CONFIG_IPV6_MROUTE return ip6mr_compat_ioctl(sk, cmd, compat_ptr(arg)); #else return -ENOIOCTLCMD; #endif } } #endif static void rawv6_close(struct sock *sk, long timeout) { if (inet_sk(sk)->inet_num == IPPROTO_RAW) ip6_ra_control(sk, -1); ip6mr_sk_done(sk); sk_common_release(sk); } static void raw6_destroy(struct sock *sk) { lock_sock(sk); ip6_flush_pending_frames(sk); release_sock(sk); } static int rawv6_init_sk(struct sock *sk) { struct raw6_sock *rp = raw6_sk(sk); sk->sk_drop_counters = &rp->drop_counters; switch (inet_sk(sk)->inet_num) { case IPPROTO_ICMPV6: rp->checksum = 1; rp->offset = 2; break; case IPPROTO_MH: rp->checksum = 1; rp->offset = 4; break; default: break; } return 0; } struct proto rawv6_prot = { .name = "RAWv6", .owner = THIS_MODULE, .close = rawv6_close, .destroy = raw6_destroy, .connect = ip6_datagram_connect_v6_only, .disconnect = __udp_disconnect, .ioctl = rawv6_ioctl, .init = rawv6_init_sk, .setsockopt = rawv6_setsockopt, .getsockopt = rawv6_getsockopt, .sendmsg = rawv6_sendmsg, .recvmsg = rawv6_recvmsg, .bind = rawv6_bind, .backlog_rcv = rawv6_rcv_skb, .hash = raw_hash_sk, .unhash = raw_unhash_sk, .obj_size = sizeof(struct raw6_sock), .ipv6_pinfo_offset = offsetof(struct raw6_sock, inet6), .useroffset = offsetof(struct raw6_sock, filter), .usersize = sizeof_field(struct raw6_sock, filter), .h.raw_hash = &raw_v6_hashinfo, #ifdef CONFIG_COMPAT .compat_ioctl = compat_rawv6_ioctl, #endif .diag_destroy = raw_abort, }; #ifdef CONFIG_PROC_FS static int raw6_seq_show(struct seq_file *seq, void *v) { if (v == SEQ_START_TOKEN) { seq_puts(seq, IPV6_SEQ_DGRAM_HEADER); } else { struct sock *sp = v; __u16 srcp = inet_sk(sp)->inet_num; ip6_dgram_sock_seq_show(seq, v, srcp, 0, raw_seq_private(seq)->bucket); } return 0; } static const struct seq_operations raw6_seq_ops = { .start = raw_seq_start, .next = raw_seq_next, .stop = raw_seq_stop, .show = raw6_seq_show, }; static int __net_init raw6_init_net(struct net *net) { if (!proc_create_net_data("raw6", 0444, net->proc_net, &raw6_seq_ops, sizeof(struct raw_iter_state), &raw_v6_hashinfo)) return -ENOMEM; return 0; } static void __net_exit 
raw6_exit_net(struct net *net) { remove_proc_entry("raw6", net->proc_net); } static struct pernet_operations raw6_net_ops = { .init = raw6_init_net, .exit = raw6_exit_net, }; int __init raw6_proc_init(void) { return register_pernet_subsys(&raw6_net_ops); } void raw6_proc_exit(void) { unregister_pernet_subsys(&raw6_net_ops); } #endif /* CONFIG_PROC_FS */ /* Same as inet6_dgram_ops, sans udp_poll. */ const struct proto_ops inet6_sockraw_ops = { .family = PF_INET6, .owner = THIS_MODULE, .release = inet6_release, .bind = inet6_bind, .connect = inet_dgram_connect, /* ok */ .socketpair = sock_no_socketpair, /* a do nothing */ .accept = sock_no_accept, /* a do nothing */ .getname = inet6_getname, .poll = datagram_poll, /* ok */ .ioctl = inet6_ioctl, /* must change */ .gettstamp = sock_gettstamp, .listen = sock_no_listen, /* ok */ .shutdown = inet_shutdown, /* ok */ .setsockopt = sock_common_setsockopt, /* ok */ .getsockopt = sock_common_getsockopt, /* ok */ .sendmsg = inet_sendmsg, /* ok */ .recvmsg = sock_common_recvmsg, /* ok */ .mmap = sock_no_mmap, #ifdef CONFIG_COMPAT .compat_ioctl = inet6_compat_ioctl, #endif }; static struct inet_protosw rawv6_protosw = { .type = SOCK_RAW, .protocol = IPPROTO_IP, /* wild card */ .prot = &rawv6_prot, .ops = &inet6_sockraw_ops, .flags = INET_PROTOSW_REUSE, }; int __init rawv6_init(void) { return inet6_register_protosw(&rawv6_protosw); } void rawv6_exit(void) { inet6_unregister_protosw(&rawv6_protosw); }
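The per-packet check in icmpv6_filter() above consults a 256-bit type bitmap (data[type >> 5] & (1U << (type & 31)), where a set bit means "block") that userspace installs through the ICMPV6_FILTER option handled by rawv6_seticmpfilter(). Below is a sketch of the corresponding userspace side using the RFC 3542 macros from <netinet/icmp6.h>, assuming the program runs with CAP_NET_RAW; it is an illustration, not part of this file.

/*
 * Userspace sketch: build the same 256-bit ICMPv6 type bitmap the kernel
 * tests per packet, blocking everything except echo replies.
 */
#include <netinet/icmp6.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	struct icmp6_filter filt;
	int sock = socket(AF_INET6, SOCK_RAW, IPPROTO_ICMPV6);

	if (sock < 0) {
		perror("socket (raw sockets need CAP_NET_RAW)");
		return 1;
	}
	/* Block every ICMPv6 type, then pass only echo replies */
	ICMP6_FILTER_SETBLOCKALL(&filt);
	ICMP6_FILTER_SETPASS(ICMP6_ECHO_REPLY, &filt);
	if (setsockopt(sock, IPPROTO_ICMPV6, ICMP6_FILTER, &filt, sizeof(filt)) < 0)
		perror("setsockopt(ICMP6_FILTER)");
	close(sock);
	return 0;
}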
// SPDX-License-Identifier: GPL-2.0-only /* net/sched/sch_hhf.c Heavy-Hitter Filter (HHF) * * Copyright (C) 2013 Terry Lam <vtlam@google.com> * Copyright (C) 2013 Nandita Dukkipati <nanditad@google.com> */ #include <linux/jiffies.h> #include <linux/module.h> #include <linux/skbuff.h> #include <linux/vmalloc.h> #include <linux/siphash.h> #include <net/pkt_sched.h> #include <net/sock.h> /* Heavy-Hitter Filter (HHF) * * Principles : * Flows are classified into two buckets: non-heavy-hitter and heavy-hitter * buckets. Initially, a new flow starts as non-heavy-hitter.
Once classified * as heavy-hitter, it is immediately switched to the heavy-hitter bucket. * The buckets are dequeued by a Weighted Deficit Round Robin (WDRR) scheduler, * in which the heavy-hitter bucket is served with less weight. * In other words, non-heavy-hitters (e.g., short bursts of critical traffic) * are isolated from heavy-hitters (e.g., persistent bulk traffic) and also have * higher share of bandwidth. * * To capture heavy-hitters, we use the "multi-stage filter" algorithm in the * following paper: * [EV02] C. Estan and G. Varghese, "New Directions in Traffic Measurement and * Accounting", in ACM SIGCOMM, 2002. * * Conceptually, a multi-stage filter comprises k independent hash functions * and k counter arrays. Packets are indexed into k counter arrays by k hash * functions, respectively. The counters are then increased by the packet sizes. * Therefore, * - For a heavy-hitter flow: *all* of its k array counters must be large. * - For a non-heavy-hitter flow: some of its k array counters can be large * due to hash collision with other small flows; however, with high * probability, not *all* k counters are large. * * By the design of the multi-stage filter algorithm, the false negative rate * (heavy-hitters getting away uncaptured) is zero. However, the algorithm is * susceptible to false positives (non-heavy-hitters mistakenly classified as * heavy-hitters). * Therefore, we also implement the following optimizations to reduce false * positives by avoiding unnecessary increment of the counter values: * - Optimization O1: once a heavy-hitter is identified, its bytes are not * accounted in the array counters. This technique is called "shielding" * in Section 3.3.1 of [EV02]. * - Optimization O2: conservative update of counters * (Section 3.3.2 of [EV02]), * New counter value = max {old counter value, * smallest counter value + packet bytes} * * Finally, we refresh the counters periodically since otherwise the counter * values will keep accumulating. * * Once a flow is classified as heavy-hitter, we also save its per-flow state * in an exact-matching flow table so that its subsequent packets can be * dispatched to the heavy-hitter bucket accordingly. * * * At a high level, this qdisc works as follows: * Given a packet p: * - If the flow-id of p (e.g., TCP 5-tuple) is already in the exact-matching * heavy-hitter flow table, denoted table T, then send p to the heavy-hitter * bucket. * - Otherwise, forward p to the multi-stage filter, denoted filter F * + If F decides that p belongs to a non-heavy-hitter flow, then send p * to the non-heavy-hitter bucket. * + Otherwise, if F decides that p belongs to a new heavy-hitter flow, * then set up a new flow entry for the flow-id of p in the table T and * send p to the heavy-hitter bucket. * * In this implementation: * - T is a fixed-size hash-table with 1024 entries. Hash collision is * resolved by linked-list chaining. * - F has four counter arrays, each array containing 1024 32-bit counters. * That means 4 * 1024 * 32 bits = 16KB of memory. * - Since each array in F contains 1024 counters, 10 bits are sufficient to * index into each array. * Hence, instead of having four hash functions, we chop the 32-bit * skb-hash into three 10-bit chunks, and the remaining 10-bit chunk is * computed as XOR sum of those three chunks. * - We need to clear the counter arrays periodically; however, directly * memsetting 16KB of memory can lead to cache eviction and unwanted delay. 
* So by representing each counter by a valid bit, we only need to reset * 4K of 1 bit (i.e. 512 bytes) instead of 16KB of memory. * - The Deficit Round Robin engine is taken from fq_codel implementation * (net/sched/sch_fq_codel.c). Note that wdrr_bucket corresponds to * fq_codel_flow in fq_codel implementation. * */ /* Non-configurable parameters */ #define HH_FLOWS_CNT 1024 /* number of entries in exact-matching table T */ #define HHF_ARRAYS_CNT 4 /* number of arrays in multi-stage filter F */ #define HHF_ARRAYS_LEN 1024 /* number of counters in each array of F */ #define HHF_BIT_MASK_LEN 10 /* masking 10 bits */ #define HHF_BIT_MASK 0x3FF /* bitmask of 10 bits */ #define WDRR_BUCKET_CNT 2 /* two buckets for Weighted DRR */ enum wdrr_bucket_idx { WDRR_BUCKET_FOR_HH = 0, /* bucket id for heavy-hitters */ WDRR_BUCKET_FOR_NON_HH = 1 /* bucket id for non-heavy-hitters */ }; #define hhf_time_before(a, b) \ (typecheck(u32, a) && typecheck(u32, b) && ((s32)((a) - (b)) < 0)) /* Heavy-hitter per-flow state */ struct hh_flow_state { u32 hash_id; /* hash of flow-id (e.g. TCP 5-tuple) */ u32 hit_timestamp; /* last time heavy-hitter was seen */ struct list_head flowchain; /* chaining under hash collision */ }; /* Weighted Deficit Round Robin (WDRR) scheduler */ struct wdrr_bucket { struct sk_buff *head; struct sk_buff *tail; struct list_head bucketchain; int deficit; }; struct hhf_sched_data { struct wdrr_bucket buckets[WDRR_BUCKET_CNT]; siphash_key_t perturbation; /* hash perturbation */ u32 quantum; /* psched_mtu(qdisc_dev(sch)); */ u32 drop_overlimit; /* number of times max qdisc packet * limit was hit */ struct list_head *hh_flows; /* table T (currently active HHs) */ u32 hh_flows_limit; /* max active HH allocs */ u32 hh_flows_overlimit; /* num of disallowed HH allocs */ u32 hh_flows_total_cnt; /* total admitted HHs */ u32 hh_flows_current_cnt; /* total current HHs */ u32 *hhf_arrays[HHF_ARRAYS_CNT]; /* HH filter F */ u32 hhf_arrays_reset_timestamp; /* last time hhf_arrays * was reset */ unsigned long *hhf_valid_bits[HHF_ARRAYS_CNT]; /* shadow valid bits * of hhf_arrays */ /* Similar to the "new_flows" vs. "old_flows" concept in fq_codel DRR */ struct list_head new_buckets; /* list of new buckets */ struct list_head old_buckets; /* list of old buckets */ /* Configurable HHF parameters */ u32 hhf_reset_timeout; /* interval to reset counter * arrays in filter F * (default 40ms) */ u32 hhf_admit_bytes; /* counter thresh to classify as * HH (default 128KB). * With these default values, * 128KB / 40ms = 25 Mbps * i.e., we expect to capture HHs * sending > 25 Mbps. */ u32 hhf_evict_timeout; /* aging threshold to evict idle * HHs out of table T. This should * be large enough to avoid * reordering during HH eviction. * (default 1s) */ u32 hhf_non_hh_weight; /* WDRR weight for non-HHs * (default 2, * i.e., non-HH : HH = 2 : 1) */ }; static u32 hhf_time_stamp(void) { return jiffies; } /* Looks up a heavy-hitter flow in a chaining list of table T. */ static struct hh_flow_state *seek_list(const u32 hash, struct list_head *head, struct hhf_sched_data *q) { struct hh_flow_state *flow, *next; u32 now = hhf_time_stamp(); if (list_empty(head)) return NULL; list_for_each_entry_safe(flow, next, head, flowchain) { u32 prev = flow->hit_timestamp + q->hhf_evict_timeout; if (hhf_time_before(prev, now)) { /* Delete expired heavy-hitters, but preserve one entry * to avoid kzalloc() when next time this slot is hit. 
*/ if (list_is_last(&flow->flowchain, head)) return NULL; list_del(&flow->flowchain); kfree(flow); q->hh_flows_current_cnt--; } else if (flow->hash_id == hash) { return flow; } } return NULL; } /* Returns a flow state entry for a new heavy-hitter. Either reuses an expired * entry or dynamically alloc a new entry. */ static struct hh_flow_state *alloc_new_hh(struct list_head *head, struct hhf_sched_data *q) { struct hh_flow_state *flow; u32 now = hhf_time_stamp(); if (!list_empty(head)) { /* Find an expired heavy-hitter flow entry. */ list_for_each_entry(flow, head, flowchain) { u32 prev = flow->hit_timestamp + q->hhf_evict_timeout; if (hhf_time_before(prev, now)) return flow; } } if (q->hh_flows_current_cnt >= q->hh_flows_limit) { q->hh_flows_overlimit++; return NULL; } /* Create new entry. */ flow = kzalloc(sizeof(struct hh_flow_state), GFP_ATOMIC); if (!flow) return NULL; q->hh_flows_current_cnt++; INIT_LIST_HEAD(&flow->flowchain); list_add_tail(&flow->flowchain, head); return flow; } /* Assigns packets to WDRR buckets. Implements a multi-stage filter to * classify heavy-hitters. */ static enum wdrr_bucket_idx hhf_classify(struct sk_buff *skb, struct Qdisc *sch) { struct hhf_sched_data *q = qdisc_priv(sch); u32 tmp_hash, hash; u32 xorsum, filter_pos[HHF_ARRAYS_CNT], flow_pos; struct hh_flow_state *flow; u32 pkt_len, min_hhf_val; int i; u32 prev; u32 now = hhf_time_stamp(); /* Reset the HHF counter arrays if this is the right time. */ prev = q->hhf_arrays_reset_timestamp + q->hhf_reset_timeout; if (hhf_time_before(prev, now)) { for (i = 0; i < HHF_ARRAYS_CNT; i++) bitmap_zero(q->hhf_valid_bits[i], HHF_ARRAYS_LEN); q->hhf_arrays_reset_timestamp = now; } /* Get hashed flow-id of the skb. */ hash = skb_get_hash_perturb(skb, &q->perturbation); /* Check if this packet belongs to an already established HH flow. */ flow_pos = hash & HHF_BIT_MASK; flow = seek_list(hash, &q->hh_flows[flow_pos], q); if (flow) { /* found its HH flow */ flow->hit_timestamp = now; return WDRR_BUCKET_FOR_HH; } /* Now pass the packet through the multi-stage filter. */ tmp_hash = hash; xorsum = 0; for (i = 0; i < HHF_ARRAYS_CNT - 1; i++) { /* Split the skb_hash into three 10-bit chunks. */ filter_pos[i] = tmp_hash & HHF_BIT_MASK; xorsum ^= filter_pos[i]; tmp_hash >>= HHF_BIT_MASK_LEN; } /* The last chunk is computed as XOR sum of other chunks. */ filter_pos[HHF_ARRAYS_CNT - 1] = xorsum ^ tmp_hash; pkt_len = qdisc_pkt_len(skb); min_hhf_val = ~0U; for (i = 0; i < HHF_ARRAYS_CNT; i++) { u32 val; if (!test_bit(filter_pos[i], q->hhf_valid_bits[i])) { q->hhf_arrays[i][filter_pos[i]] = 0; __set_bit(filter_pos[i], q->hhf_valid_bits[i]); } val = q->hhf_arrays[i][filter_pos[i]] + pkt_len; if (min_hhf_val > val) min_hhf_val = val; } /* Found a new HH iff all counter values > HH admit threshold. */ if (min_hhf_val > q->hhf_admit_bytes) { /* Just captured a new heavy-hitter. */ flow = alloc_new_hh(&q->hh_flows[flow_pos], q); if (!flow) /* memory alloc problem */ return WDRR_BUCKET_FOR_NON_HH; flow->hash_id = hash; flow->hit_timestamp = now; q->hh_flows_total_cnt++; /* By returning without updating counters in q->hhf_arrays, * we implicitly implement "shielding" (see Optimization O1). */ return WDRR_BUCKET_FOR_HH; } /* Conservative update of HHF arrays (see Optimization O2). */ for (i = 0; i < HHF_ARRAYS_CNT; i++) { if (q->hhf_arrays[i][filter_pos[i]] < min_hhf_val) q->hhf_arrays[i][filter_pos[i]] = min_hhf_val; } return WDRR_BUCKET_FOR_NON_HH; } /* Removes one skb from head of bucket. 
*/ static struct sk_buff *dequeue_head(struct wdrr_bucket *bucket) { struct sk_buff *skb = bucket->head; bucket->head = skb->next; skb_mark_not_on_list(skb); return skb; } /* Tail-adds skb to bucket. */ static void bucket_add(struct wdrr_bucket *bucket, struct sk_buff *skb) { if (bucket->head == NULL) bucket->head = skb; else bucket->tail->next = skb; bucket->tail = skb; skb->next = NULL; } static unsigned int hhf_drop(struct Qdisc *sch, struct sk_buff **to_free) { struct hhf_sched_data *q = qdisc_priv(sch); struct wdrr_bucket *bucket; /* Always try to drop from heavy-hitters first. */ bucket = &q->buckets[WDRR_BUCKET_FOR_HH]; if (!bucket->head) bucket = &q->buckets[WDRR_BUCKET_FOR_NON_HH]; if (bucket->head) { struct sk_buff *skb = dequeue_head(bucket); sch->q.qlen--; qdisc_qstats_backlog_dec(sch, skb); qdisc_drop(skb, sch, to_free); } /* Return id of the bucket from which the packet was dropped. */ return bucket - q->buckets; } static int hhf_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { struct hhf_sched_data *q = qdisc_priv(sch); enum wdrr_bucket_idx idx; struct wdrr_bucket *bucket; unsigned int prev_backlog; idx = hhf_classify(skb, sch); bucket = &q->buckets[idx]; bucket_add(bucket, skb); qdisc_qstats_backlog_inc(sch, skb); if (list_empty(&bucket->bucketchain)) { unsigned int weight; /* The logic of new_buckets vs. old_buckets is the same as * new_flows vs. old_flows in the implementation of fq_codel, * i.e., short bursts of non-HHs should have strict priority. */ if (idx == WDRR_BUCKET_FOR_HH) { /* Always move heavy-hitters to old bucket. */ weight = 1; list_add_tail(&bucket->bucketchain, &q->old_buckets); } else { weight = q->hhf_non_hh_weight; list_add_tail(&bucket->bucketchain, &q->new_buckets); } bucket->deficit = weight * q->quantum; } if (++sch->q.qlen <= sch->limit) return NET_XMIT_SUCCESS; prev_backlog = sch->qstats.backlog; q->drop_overlimit++; /* Return Congestion Notification only if we dropped a packet from this * bucket. */ if (hhf_drop(sch, to_free) == idx) return NET_XMIT_CN; /* As we dropped a packet, better let upper stack know this. */ qdisc_tree_reduce_backlog(sch, 1, prev_backlog - sch->qstats.backlog); return NET_XMIT_SUCCESS; } static struct sk_buff *hhf_dequeue(struct Qdisc *sch) { struct hhf_sched_data *q = qdisc_priv(sch); struct sk_buff *skb = NULL; struct wdrr_bucket *bucket; struct list_head *head; begin: head = &q->new_buckets; if (list_empty(head)) { head = &q->old_buckets; if (list_empty(head)) return NULL; } bucket = list_first_entry(head, struct wdrr_bucket, bucketchain); if (bucket->deficit <= 0) { int weight = (bucket - q->buckets == WDRR_BUCKET_FOR_HH) ? 1 : q->hhf_non_hh_weight; bucket->deficit += weight * q->quantum; list_move_tail(&bucket->bucketchain, &q->old_buckets); goto begin; } if (bucket->head) { skb = dequeue_head(bucket); sch->q.qlen--; qdisc_qstats_backlog_dec(sch, skb); } if (!skb) { /* Force a pass through old_buckets to prevent starvation. 
*/ if ((head == &q->new_buckets) && !list_empty(&q->old_buckets)) list_move_tail(&bucket->bucketchain, &q->old_buckets); else list_del_init(&bucket->bucketchain); goto begin; } qdisc_bstats_update(sch, skb); bucket->deficit -= qdisc_pkt_len(skb); return skb; } static void hhf_reset(struct Qdisc *sch) { struct sk_buff *skb; while ((skb = hhf_dequeue(sch)) != NULL) rtnl_kfree_skbs(skb, skb); } static void hhf_destroy(struct Qdisc *sch) { int i; struct hhf_sched_data *q = qdisc_priv(sch); for (i = 0; i < HHF_ARRAYS_CNT; i++) { kvfree(q->hhf_arrays[i]); kvfree(q->hhf_valid_bits[i]); } if (!q->hh_flows) return; for (i = 0; i < HH_FLOWS_CNT; i++) { struct hh_flow_state *flow, *next; struct list_head *head = &q->hh_flows[i]; if (list_empty(head)) continue; list_for_each_entry_safe(flow, next, head, flowchain) { list_del(&flow->flowchain); kfree(flow); } } kvfree(q->hh_flows); } static const struct nla_policy hhf_policy[TCA_HHF_MAX + 1] = { [TCA_HHF_BACKLOG_LIMIT] = { .type = NLA_U32 }, [TCA_HHF_QUANTUM] = { .type = NLA_U32 }, [TCA_HHF_HH_FLOWS_LIMIT] = { .type = NLA_U32 }, [TCA_HHF_RESET_TIMEOUT] = { .type = NLA_U32 }, [TCA_HHF_ADMIT_BYTES] = { .type = NLA_U32 }, [TCA_HHF_EVICT_TIMEOUT] = { .type = NLA_U32 }, [TCA_HHF_NON_HH_WEIGHT] = { .type = NLA_U32 }, }; static int hhf_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { unsigned int dropped_pkts = 0, dropped_bytes = 0; struct hhf_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_HHF_MAX + 1]; int err; u64 non_hh_quantum; u32 new_quantum = q->quantum; u32 new_hhf_non_hh_weight = q->hhf_non_hh_weight; err = nla_parse_nested_deprecated(tb, TCA_HHF_MAX, opt, hhf_policy, NULL); if (err < 0) return err; if (tb[TCA_HHF_QUANTUM]) new_quantum = nla_get_u32(tb[TCA_HHF_QUANTUM]); if (tb[TCA_HHF_NON_HH_WEIGHT]) new_hhf_non_hh_weight = nla_get_u32(tb[TCA_HHF_NON_HH_WEIGHT]); non_hh_quantum = (u64)new_quantum * new_hhf_non_hh_weight; if (non_hh_quantum == 0 || non_hh_quantum > INT_MAX) return -EINVAL; sch_tree_lock(sch); if (tb[TCA_HHF_BACKLOG_LIMIT]) WRITE_ONCE(sch->limit, nla_get_u32(tb[TCA_HHF_BACKLOG_LIMIT])); WRITE_ONCE(q->quantum, new_quantum); WRITE_ONCE(q->hhf_non_hh_weight, new_hhf_non_hh_weight); if (tb[TCA_HHF_HH_FLOWS_LIMIT]) WRITE_ONCE(q->hh_flows_limit, nla_get_u32(tb[TCA_HHF_HH_FLOWS_LIMIT])); if (tb[TCA_HHF_RESET_TIMEOUT]) { u32 us = nla_get_u32(tb[TCA_HHF_RESET_TIMEOUT]); WRITE_ONCE(q->hhf_reset_timeout, usecs_to_jiffies(us)); } if (tb[TCA_HHF_ADMIT_BYTES]) WRITE_ONCE(q->hhf_admit_bytes, nla_get_u32(tb[TCA_HHF_ADMIT_BYTES])); if (tb[TCA_HHF_EVICT_TIMEOUT]) { u32 us = nla_get_u32(tb[TCA_HHF_EVICT_TIMEOUT]); WRITE_ONCE(q->hhf_evict_timeout, usecs_to_jiffies(us)); } while (sch->q.qlen > sch->limit) { struct sk_buff *skb = qdisc_dequeue_internal(sch, false); if (!skb) break; dropped_pkts++; dropped_bytes += qdisc_pkt_len(skb); rtnl_kfree_skbs(skb, skb); } qdisc_tree_reduce_backlog(sch, dropped_pkts, dropped_bytes); sch_tree_unlock(sch); return 0; } static int hhf_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct hhf_sched_data *q = qdisc_priv(sch); int i; sch->limit = 1000; q->quantum = psched_mtu(qdisc_dev(sch)); get_random_bytes(&q->perturbation, sizeof(q->perturbation)); INIT_LIST_HEAD(&q->new_buckets); INIT_LIST_HEAD(&q->old_buckets); /* Configurable HHF parameters */ q->hhf_reset_timeout = HZ / 25; /* 40 ms */ q->hhf_admit_bytes = 131072; /* 128 KB */ q->hhf_evict_timeout = HZ; /* 1 sec */ q->hhf_non_hh_weight = 2; if (opt) { int err = hhf_change(sch, opt, extack); if 
(err) return err; } if (!q->hh_flows) { /* Initialize heavy-hitter flow table. */ q->hh_flows = kvcalloc(HH_FLOWS_CNT, sizeof(struct list_head), GFP_KERNEL); if (!q->hh_flows) return -ENOMEM; for (i = 0; i < HH_FLOWS_CNT; i++) INIT_LIST_HEAD(&q->hh_flows[i]); /* Cap max active HHs at twice len of hh_flows table. */ q->hh_flows_limit = 2 * HH_FLOWS_CNT; q->hh_flows_overlimit = 0; q->hh_flows_total_cnt = 0; q->hh_flows_current_cnt = 0; /* Initialize heavy-hitter filter arrays. */ for (i = 0; i < HHF_ARRAYS_CNT; i++) { q->hhf_arrays[i] = kvcalloc(HHF_ARRAYS_LEN, sizeof(u32), GFP_KERNEL); if (!q->hhf_arrays[i]) { /* Note: hhf_destroy() will be called * by our caller. */ return -ENOMEM; } } q->hhf_arrays_reset_timestamp = hhf_time_stamp(); /* Initialize valid bits of heavy-hitter filter arrays. */ for (i = 0; i < HHF_ARRAYS_CNT; i++) { q->hhf_valid_bits[i] = kvzalloc(HHF_ARRAYS_LEN / BITS_PER_BYTE, GFP_KERNEL); if (!q->hhf_valid_bits[i]) { /* Note: hhf_destroy() will be called * by our caller. */ return -ENOMEM; } } /* Initialize Weighted DRR buckets. */ for (i = 0; i < WDRR_BUCKET_CNT; i++) { struct wdrr_bucket *bucket = q->buckets + i; INIT_LIST_HEAD(&bucket->bucketchain); } } return 0; } static int hhf_dump(struct Qdisc *sch, struct sk_buff *skb) { struct hhf_sched_data *q = qdisc_priv(sch); struct nlattr *opts; opts = nla_nest_start_noflag(skb, TCA_OPTIONS); if (opts == NULL) goto nla_put_failure; if (nla_put_u32(skb, TCA_HHF_BACKLOG_LIMIT, READ_ONCE(sch->limit)) || nla_put_u32(skb, TCA_HHF_QUANTUM, READ_ONCE(q->quantum)) || nla_put_u32(skb, TCA_HHF_HH_FLOWS_LIMIT, READ_ONCE(q->hh_flows_limit)) || nla_put_u32(skb, TCA_HHF_RESET_TIMEOUT, jiffies_to_usecs(READ_ONCE(q->hhf_reset_timeout))) || nla_put_u32(skb, TCA_HHF_ADMIT_BYTES, READ_ONCE(q->hhf_admit_bytes)) || nla_put_u32(skb, TCA_HHF_EVICT_TIMEOUT, jiffies_to_usecs(READ_ONCE(q->hhf_evict_timeout))) || nla_put_u32(skb, TCA_HHF_NON_HH_WEIGHT, READ_ONCE(q->hhf_non_hh_weight))) goto nla_put_failure; return nla_nest_end(skb, opts); nla_put_failure: return -1; } static int hhf_dump_stats(struct Qdisc *sch, struct gnet_dump *d) { struct hhf_sched_data *q = qdisc_priv(sch); struct tc_hhf_xstats st = { .drop_overlimit = q->drop_overlimit, .hh_overlimit = q->hh_flows_overlimit, .hh_tot_count = q->hh_flows_total_cnt, .hh_cur_count = q->hh_flows_current_cnt, }; return gnet_stats_copy_app(d, &st, sizeof(st)); } static struct Qdisc_ops hhf_qdisc_ops __read_mostly = { .id = "hhf", .priv_size = sizeof(struct hhf_sched_data), .enqueue = hhf_enqueue, .dequeue = hhf_dequeue, .peek = qdisc_peek_dequeued, .init = hhf_init, .reset = hhf_reset, .destroy = hhf_destroy, .change = hhf_change, .dump = hhf_dump, .dump_stats = hhf_dump_stats, .owner = THIS_MODULE, }; MODULE_ALIAS_NET_SCH("hhf"); static int __init hhf_module_init(void) { return register_qdisc(&hhf_qdisc_ops); } static void __exit hhf_module_exit(void) { unregister_qdisc(&hhf_qdisc_ops); } module_init(hhf_module_init) module_exit(hhf_module_exit) MODULE_AUTHOR("Terry Lam"); MODULE_AUTHOR("Nandita Dukkipati"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Heavy-Hitter Filter (HHF)");
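A minimal userspace sketch (not kernel code; the constants simply mirror the HHF_* defines above) of how hhf_classify() derives its four counter indices from one 32-bit flow hash: three consecutive 10-bit chunks plus an XOR-folded last chunk.

#include <stdint.h>
#include <stdio.h>

#define ARRAYS_CNT   4          /* mirrors HHF_ARRAYS_CNT */
#define BIT_MASK_LEN 10         /* mirrors HHF_BIT_MASK_LEN */
#define BIT_MASK     0x3FFu     /* mirrors HHF_BIT_MASK */

/* Split one 32-bit flow hash into four counter positions, as hhf_classify() does. */
static void filter_positions(uint32_t hash, uint32_t pos[ARRAYS_CNT])
{
	uint32_t tmp = hash, xorsum = 0;
	int i;

	for (i = 0; i < ARRAYS_CNT - 1; i++) {
		pos[i] = tmp & BIT_MASK;
		xorsum ^= pos[i];
		tmp >>= BIT_MASK_LEN;
	}
	/* Last position: XOR of the other chunks and the leftover top bits. */
	pos[ARRAYS_CNT - 1] = xorsum ^ tmp;
}

int main(void)
{
	uint32_t pos[ARRAYS_CNT];
	int i;

	filter_positions(0xdeadbeefu, pos);
	for (i = 0; i < ARRAYS_CNT; i++)
		printf("array %d -> counter index %u\n", i, (unsigned)pos[i]);
	return 0;
}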
5 7 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 // SPDX-License-Identifier: GPL-2.0-or-later /* * LAPB release 002 * * This code REQUIRES 2.1.15 or higher/ NET3.038 * * History * LAPB 001 Jonathan Naylor Started Coding * LAPB 002 Jonathan Naylor New timer architecture. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/in.h> #include <linux/kernel.h> #include <linux/jiffies.h> #include <linux/timer.h> #include <linux/string.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/inet.h> #include <linux/skbuff.h> #include <net/sock.h> #include <linux/uaccess.h> #include <linux/fcntl.h> #include <linux/mm.h> #include <linux/interrupt.h> #include <net/lapb.h> static void lapb_t1timer_expiry(struct timer_list *); static void lapb_t2timer_expiry(struct timer_list *); void lapb_start_t1timer(struct lapb_cb *lapb) { timer_delete(&lapb->t1timer); lapb->t1timer.function = lapb_t1timer_expiry; lapb->t1timer.expires = jiffies + lapb->t1; lapb->t1timer_running = true; add_timer(&lapb->t1timer); } void lapb_start_t2timer(struct lapb_cb *lapb) { timer_delete(&lapb->t2timer); lapb->t2timer.function = lapb_t2timer_expiry; lapb->t2timer.expires = jiffies + lapb->t2; lapb->t2timer_running = true; add_timer(&lapb->t2timer); } void lapb_stop_t1timer(struct lapb_cb *lapb) { lapb->t1timer_running = false; timer_delete(&lapb->t1timer); } void lapb_stop_t2timer(struct lapb_cb *lapb) { lapb->t2timer_running = false; timer_delete(&lapb->t2timer); } int lapb_t1timer_running(struct lapb_cb *lapb) { return lapb->t1timer_running; } static void lapb_t2timer_expiry(struct timer_list *t) { struct lapb_cb *lapb = timer_container_of(lapb, t, t2timer); spin_lock_bh(&lapb->lock); if (timer_pending(&lapb->t2timer)) /* A new timer has been set up */ goto out; if (!lapb->t2timer_running) /* The timer has been stopped */ goto out; if (lapb->condition & LAPB_ACK_PENDING_CONDITION) { lapb->condition &= ~LAPB_ACK_PENDING_CONDITION; lapb_timeout_response(lapb); } lapb->t2timer_running = false; out: spin_unlock_bh(&lapb->lock); } static void lapb_t1timer_expiry(struct timer_list *t) { struct lapb_cb *lapb = timer_container_of(lapb, t, t1timer); spin_lock_bh(&lapb->lock); if (timer_pending(&lapb->t1timer)) /* A new timer has been set up */ goto out; if (!lapb->t1timer_running) /* The timer has been stopped */ goto out; switch (lapb->state) { /* * If we are a DCE, send DM up to N2 times, then switch to * STATE_1 and send SABM(E). */ case LAPB_STATE_0: if (lapb->mode & LAPB_DCE && lapb->n2count != lapb->n2) { lapb->n2count++; lapb_send_control(lapb, LAPB_DM, LAPB_POLLOFF, LAPB_RESPONSE); } else { lapb->state = LAPB_STATE_1; lapb_establish_data_link(lapb); } break; /* * Awaiting connection state, send SABM(E), up to N2 times. 
*/ case LAPB_STATE_1: if (lapb->n2count == lapb->n2) { lapb_clear_queues(lapb); lapb->state = LAPB_STATE_0; lapb_disconnect_indication(lapb, LAPB_TIMEDOUT); lapb_dbg(0, "(%p) S1 -> S0\n", lapb->dev); lapb->t1timer_running = false; goto out; } else { lapb->n2count++; if (lapb->mode & LAPB_EXTENDED) { lapb_dbg(1, "(%p) S1 TX SABME(1)\n", lapb->dev); lapb_send_control(lapb, LAPB_SABME, LAPB_POLLON, LAPB_COMMAND); } else { lapb_dbg(1, "(%p) S1 TX SABM(1)\n", lapb->dev); lapb_send_control(lapb, LAPB_SABM, LAPB_POLLON, LAPB_COMMAND); } } break; /* * Awaiting disconnection state, send DISC, up to N2 times. */ case LAPB_STATE_2: if (lapb->n2count == lapb->n2) { lapb_clear_queues(lapb); lapb->state = LAPB_STATE_0; lapb_disconnect_confirmation(lapb, LAPB_TIMEDOUT); lapb_dbg(0, "(%p) S2 -> S0\n", lapb->dev); lapb->t1timer_running = false; goto out; } else { lapb->n2count++; lapb_dbg(1, "(%p) S2 TX DISC(1)\n", lapb->dev); lapb_send_control(lapb, LAPB_DISC, LAPB_POLLON, LAPB_COMMAND); } break; /* * Data transfer state, restransmit I frames, up to N2 times. */ case LAPB_STATE_3: if (lapb->n2count == lapb->n2) { lapb_clear_queues(lapb); lapb->state = LAPB_STATE_0; lapb_stop_t2timer(lapb); lapb_disconnect_indication(lapb, LAPB_TIMEDOUT); lapb_dbg(0, "(%p) S3 -> S0\n", lapb->dev); lapb->t1timer_running = false; goto out; } else { lapb->n2count++; lapb_requeue_frames(lapb); lapb_kick(lapb); } break; /* * Frame reject state, restransmit FRMR frames, up to N2 times. */ case LAPB_STATE_4: if (lapb->n2count == lapb->n2) { lapb_clear_queues(lapb); lapb->state = LAPB_STATE_0; lapb_disconnect_indication(lapb, LAPB_TIMEDOUT); lapb_dbg(0, "(%p) S4 -> S0\n", lapb->dev); lapb->t1timer_running = false; goto out; } else { lapb->n2count++; lapb_transmit_frmr(lapb); } break; } lapb_start_t1timer(lapb); out: spin_unlock_bh(&lapb->lock); }
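The T1 expiry handler above applies the same bounded-retry pattern in every LAPB state: give up once N2 attempts have been used, otherwise retransmit and restart T1. A simplified userspace sketch of that pattern (a model only, not the kernel state machine):

#include <stdbool.h>
#include <stdio.h>

struct link {
	unsigned int n2;      /* maximum number of retries */
	unsigned int n2count; /* retries used so far */
};

/* Returns true if the caller should restart T1, false if the link gives up. */
static bool t1_expired(struct link *lk)
{
	if (lk->n2count == lk->n2) {
		printf("retry limit reached: declare the link down\n");
		return false;
	}
	lk->n2count++;
	printf("retransmit attempt %u of %u, restart T1\n",
	       lk->n2count, lk->n2);
	return true;
}

int main(void)
{
	struct link lk = { .n2 = 3, .n2count = 0 };

	while (t1_expired(&lk))
		; /* each iteration models one T1 expiry */
	return 0;
}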
1011 792 544 544 545 543 545 1281 1013 1160 138 138 138 136 136 138 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 // SPDX-License-Identifier: GPL-2.0-only /* * Network port table * * SELinux must keep a mapping of network ports to labels/SIDs. This * mapping is maintained as part of the normal policy but a fast cache is * needed to reduce the lookup overhead. * * Author: Paul Moore <paul@paul-moore.com> * * This code is heavily based on the "netif" concept originally developed by * James Morris <jmorris@redhat.com> * (see security/selinux/netif.c for more information) */ /* * (c) Copyright Hewlett-Packard Development Company, L.P., 2008 */ #include <linux/types.h> #include <linux/rcupdate.h> #include <linux/list.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/in.h> #include <linux/in6.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <net/ip.h> #include <net/ipv6.h> #include "netport.h" #include "objsec.h" #define SEL_NETPORT_HASH_SIZE 256 #define SEL_NETPORT_HASH_BKT_LIMIT 16 struct sel_netport_bkt { int size; struct list_head list; }; struct sel_netport { struct netport_security_struct psec; struct list_head list; struct rcu_head rcu; }; static DEFINE_SPINLOCK(sel_netport_lock); static struct sel_netport_bkt sel_netport_hash[SEL_NETPORT_HASH_SIZE]; /** * sel_netport_hashfn - Hashing function for the port table * @pnum: port number * * Description: * This is the hashing function for the port table, it returns the bucket * number for the given port. * */ static unsigned int sel_netport_hashfn(u16 pnum) { return (pnum & (SEL_NETPORT_HASH_SIZE - 1)); } /** * sel_netport_find - Search for a port record * @protocol: protocol * @pnum: port * * Description: * Search the network port table and return the matching record. If an entry * can not be found in the table return NULL. * */ static struct sel_netport *sel_netport_find(u8 protocol, u16 pnum) { unsigned int idx; struct sel_netport *port; idx = sel_netport_hashfn(pnum); list_for_each_entry_rcu(port, &sel_netport_hash[idx].list, list) if (port->psec.port == pnum && port->psec.protocol == protocol) return port; return NULL; } /** * sel_netport_insert - Insert a new port into the table * @port: the new port record * * Description: * Add a new port record to the network address hash table. 
* */ static void sel_netport_insert(struct sel_netport *port) { unsigned int idx; /* we need to impose a limit on the growth of the hash table so check * this bucket to make sure it is within the specified bounds */ idx = sel_netport_hashfn(port->psec.port); list_add_rcu(&port->list, &sel_netport_hash[idx].list); if (sel_netport_hash[idx].size == SEL_NETPORT_HASH_BKT_LIMIT) { struct sel_netport *tail; tail = list_entry( rcu_dereference_protected( list_tail_rcu(&sel_netport_hash[idx].list), lockdep_is_held(&sel_netport_lock)), struct sel_netport, list); list_del_rcu(&tail->list); kfree_rcu(tail, rcu); } else sel_netport_hash[idx].size++; } /** * sel_netport_sid_slow - Lookup the SID of a network address using the policy * @protocol: protocol * @pnum: port * @sid: port SID * * Description: * This function determines the SID of a network port by querying the security * policy. The result is added to the network port table to speedup future * queries. Returns zero on success, negative values on failure. * */ static int sel_netport_sid_slow(u8 protocol, u16 pnum, u32 *sid) { int ret; struct sel_netport *port; struct sel_netport *new; spin_lock_bh(&sel_netport_lock); port = sel_netport_find(protocol, pnum); if (port != NULL) { *sid = port->psec.sid; spin_unlock_bh(&sel_netport_lock); return 0; } ret = security_port_sid(protocol, pnum, sid); if (ret != 0) goto out; /* If this memory allocation fails still return 0. The SID * is valid, it just won't be added to the cache. */ new = kmalloc(sizeof(*new), GFP_ATOMIC); if (new) { new->psec.port = pnum; new->psec.protocol = protocol; new->psec.sid = *sid; sel_netport_insert(new); } out: spin_unlock_bh(&sel_netport_lock); if (unlikely(ret)) pr_warn("SELinux: failure in %s(), unable to determine network port label\n", __func__); return ret; } /** * sel_netport_sid - Lookup the SID of a network port * @protocol: protocol * @pnum: port * @sid: port SID * * Description: * This function determines the SID of a network port using the fastest method * possible. First the port table is queried, but if an entry can't be found * then the policy is queried and the result is added to the table to speedup * future queries. Returns zero on success, negative values on failure. * */ int sel_netport_sid(u8 protocol, u16 pnum, u32 *sid) { struct sel_netport *port; rcu_read_lock(); port = sel_netport_find(protocol, pnum); if (likely(port != NULL)) { *sid = port->psec.sid; rcu_read_unlock(); return 0; } rcu_read_unlock(); return sel_netport_sid_slow(protocol, pnum, sid); } /** * sel_netport_flush - Flush the entire network port table * * Description: * Remove all entries from the network address table. * */ void sel_netport_flush(void) { unsigned int idx; struct sel_netport *port, *port_tmp; spin_lock_bh(&sel_netport_lock); for (idx = 0; idx < SEL_NETPORT_HASH_SIZE; idx++) { list_for_each_entry_safe(port, port_tmp, &sel_netport_hash[idx].list, list) { list_del_rcu(&port->list); kfree_rcu(port, rcu); } sel_netport_hash[idx].size = 0; } spin_unlock_bh(&sel_netport_lock); } static __init int sel_netport_init(void) { int iter; if (!selinux_enabled_boot) return 0; for (iter = 0; iter < SEL_NETPORT_HASH_SIZE; iter++) { INIT_LIST_HEAD(&sel_netport_hash[iter].list); sel_netport_hash[iter].size = 0; } return 0; } __initcall(sel_netport_init);
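The bucket selection in sel_netport_hashfn() relies on SEL_NETPORT_HASH_SIZE being a power of two, so that masking with size - 1 is equivalent to a modulo. A small standalone userspace sketch of that idiom (not the SELinux code itself):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define HASH_SIZE 256u /* must stay a power of two for the mask trick */

static unsigned int bucket_of(uint16_t port)
{
	return port & (HASH_SIZE - 1);
}

int main(void)
{
	assert((HASH_SIZE & (HASH_SIZE - 1)) == 0); /* power-of-two check */
	printf("port 22   -> bucket %u\n", bucket_of(22));
	printf("port 8080 -> bucket %u\n", bucket_of(8080));
	printf("port 8336 -> bucket %u\n", bucket_of(8336)); /* same bucket as 8080 */
	return 0;
}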
130 89 92 1 39 3 39 4 37 6 8 35 2 41 41 2 45 3 42 4 4 4 410 520 410 248 561 41 42 199 106 116 1 216 26 199 92 553 409 555 5099 5253 52 18 50 208 208 66 66 3 3 10 3 18 6 13 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 // SPDX-License-Identifier: GPL-2.0-only /* * fs/kernfs/inode.c - kernfs inode implementation * * Copyright (c) 2001-3 Patrick Mochel * Copyright (c) 2007 SUSE Linux Products GmbH * Copyright (c) 2007, 2013 Tejun Heo <tj@kernel.org> */ #include <linux/pagemap.h> #include <linux/backing-dev.h> #include <linux/capability.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/xattr.h> #include <linux/security.h> #include "kernfs-internal.h" static const struct inode_operations kernfs_iops = { .permission = kernfs_iop_permission, .setattr = kernfs_iop_setattr, .getattr = kernfs_iop_getattr, .listxattr = kernfs_iop_listxattr, }; static struct kernfs_iattrs *__kernfs_iattrs(struct kernfs_node *kn, bool alloc) { struct kernfs_iattrs *ret __free(kfree) = NULL; struct kernfs_iattrs *attr; attr = READ_ONCE(kn->iattr); if (attr || !alloc) return attr; ret = kmem_cache_zalloc(kernfs_iattrs_cache, GFP_KERNEL); if (!ret) return NULL; /* assign default attributes */ ret->ia_uid = GLOBAL_ROOT_UID; ret->ia_gid = GLOBAL_ROOT_GID; ktime_get_real_ts64(&ret->ia_atime); ret->ia_mtime = ret->ia_atime; ret->ia_ctime = ret->ia_atime; simple_xattrs_init(&ret->xattrs); atomic_set(&ret->nr_user_xattrs, 0); atomic_set(&ret->user_xattr_size, 0); /* If someone raced us, recognize it. 
*/ if (!try_cmpxchg(&kn->iattr, &attr, ret)) return READ_ONCE(kn->iattr); return no_free_ptr(ret); } static struct kernfs_iattrs *kernfs_iattrs(struct kernfs_node *kn) { return __kernfs_iattrs(kn, true); } static struct kernfs_iattrs *kernfs_iattrs_noalloc(struct kernfs_node *kn) { return __kernfs_iattrs(kn, false); } int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr) { struct kernfs_iattrs *attrs; unsigned int ia_valid = iattr->ia_valid; attrs = kernfs_iattrs(kn); if (!attrs) return -ENOMEM; if (ia_valid & ATTR_UID) attrs->ia_uid = iattr->ia_uid; if (ia_valid & ATTR_GID) attrs->ia_gid = iattr->ia_gid; if (ia_valid & ATTR_ATIME) attrs->ia_atime = iattr->ia_atime; if (ia_valid & ATTR_MTIME) attrs->ia_mtime = iattr->ia_mtime; if (ia_valid & ATTR_CTIME) attrs->ia_ctime = iattr->ia_ctime; if (ia_valid & ATTR_MODE) kn->mode = iattr->ia_mode; return 0; } /** * kernfs_setattr - set iattr on a node * @kn: target node * @iattr: iattr to set * * Return: %0 on success, -errno on failure. */ int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr) { int ret; struct kernfs_root *root = kernfs_root(kn); down_write(&root->kernfs_iattr_rwsem); ret = __kernfs_setattr(kn, iattr); up_write(&root->kernfs_iattr_rwsem); return ret; } int kernfs_iop_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *iattr) { struct inode *inode = d_inode(dentry); struct kernfs_node *kn = inode->i_private; struct kernfs_root *root; int error; if (!kn) return -EINVAL; root = kernfs_root(kn); down_write(&root->kernfs_iattr_rwsem); error = setattr_prepare(&nop_mnt_idmap, dentry, iattr); if (error) goto out; error = __kernfs_setattr(kn, iattr); if (error) goto out; /* this ignores size changes */ setattr_copy(&nop_mnt_idmap, inode, iattr); out: up_write(&root->kernfs_iattr_rwsem); return error; } ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size) { struct kernfs_node *kn = kernfs_dentry_node(dentry); struct kernfs_iattrs *attrs; attrs = kernfs_iattrs(kn); if (!attrs) return -ENOMEM; return simple_xattr_list(d_inode(dentry), &attrs->xattrs, buf, size); } static inline void set_default_inode_attr(struct inode *inode, umode_t mode) { inode->i_mode = mode; simple_inode_init_ts(inode); } static inline void set_inode_attr(struct inode *inode, struct kernfs_iattrs *attrs) { inode->i_uid = attrs->ia_uid; inode->i_gid = attrs->ia_gid; inode_set_atime_to_ts(inode, attrs->ia_atime); inode_set_mtime_to_ts(inode, attrs->ia_mtime); inode_set_ctime_to_ts(inode, attrs->ia_ctime); } static void kernfs_refresh_inode(struct kernfs_node *kn, struct inode *inode) { struct kernfs_iattrs *attrs; inode->i_mode = kn->mode; attrs = kernfs_iattrs_noalloc(kn); if (attrs) /* * kernfs_node has non-default attributes get them from * persistent copy in kernfs_node. 
*/ set_inode_attr(inode, attrs); if (kernfs_type(kn) == KERNFS_DIR) set_nlink(inode, kn->dir.subdirs + 2); } int kernfs_iop_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); struct kernfs_node *kn = inode->i_private; struct kernfs_root *root = kernfs_root(kn); down_read(&root->kernfs_iattr_rwsem); kernfs_refresh_inode(kn, inode); generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); up_read(&root->kernfs_iattr_rwsem); return 0; } static void kernfs_init_inode(struct kernfs_node *kn, struct inode *inode) { kernfs_get(kn); inode->i_private = kn; inode->i_mapping->a_ops = &ram_aops; inode->i_op = &kernfs_iops; inode->i_generation = kernfs_gen(kn); set_default_inode_attr(inode, kn->mode); kernfs_refresh_inode(kn, inode); /* initialize inode according to type */ switch (kernfs_type(kn)) { case KERNFS_DIR: inode->i_op = &kernfs_dir_iops; inode->i_fop = &kernfs_dir_fops; if (kn->flags & KERNFS_EMPTY_DIR) make_empty_dir_inode(inode); break; case KERNFS_FILE: inode->i_size = kn->attr.size; inode->i_fop = &kernfs_file_fops; break; case KERNFS_LINK: inode->i_op = &kernfs_symlink_iops; break; default: BUG(); } unlock_new_inode(inode); } /** * kernfs_get_inode - get inode for kernfs_node * @sb: super block * @kn: kernfs_node to allocate inode for * * Get inode for @kn. If such inode doesn't exist, a new inode is * allocated and basics are initialized. New inode is returned * locked. * * Locking: * Kernel thread context (may sleep). * * Return: * Pointer to allocated inode on success, %NULL on failure. */ struct inode *kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn) { struct inode *inode; inode = iget_locked(sb, kernfs_ino(kn)); if (inode && (inode->i_state & I_NEW)) kernfs_init_inode(kn, inode); return inode; } /* * The kernfs_node serves as both an inode and a directory entry for * kernfs. To prevent the kernfs inode numbers from being freed * prematurely we take a reference to kernfs_node from the kernfs inode. A * super_operations.evict_inode() implementation is needed to drop that * reference upon inode destruction. 
*/ void kernfs_evict_inode(struct inode *inode) { struct kernfs_node *kn = inode->i_private; truncate_inode_pages_final(&inode->i_data); clear_inode(inode); kernfs_put(kn); } int kernfs_iop_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { struct kernfs_node *kn; struct kernfs_root *root; int ret; if (mask & MAY_NOT_BLOCK) return -ECHILD; kn = inode->i_private; root = kernfs_root(kn); down_read(&root->kernfs_iattr_rwsem); kernfs_refresh_inode(kn, inode); ret = generic_permission(&nop_mnt_idmap, inode, mask); up_read(&root->kernfs_iattr_rwsem); return ret; } int kernfs_xattr_get(struct kernfs_node *kn, const char *name, void *value, size_t size) { struct kernfs_iattrs *attrs = kernfs_iattrs_noalloc(kn); if (!attrs) return -ENODATA; return simple_xattr_get(&attrs->xattrs, name, value, size); } int kernfs_xattr_set(struct kernfs_node *kn, const char *name, const void *value, size_t size, int flags) { struct simple_xattr *old_xattr; struct kernfs_iattrs *attrs; attrs = kernfs_iattrs(kn); if (!attrs) return -ENOMEM; old_xattr = simple_xattr_set(&attrs->xattrs, name, value, size, flags); if (IS_ERR(old_xattr)) return PTR_ERR(old_xattr); simple_xattr_free(old_xattr); return 0; } static int kernfs_vfs_xattr_get(const struct xattr_handler *handler, struct dentry *unused, struct inode *inode, const char *suffix, void *value, size_t size) { const char *name = xattr_full_name(handler, suffix); struct kernfs_node *kn = inode->i_private; return kernfs_xattr_get(kn, name, value, size); } static int kernfs_vfs_xattr_set(const struct xattr_handler *handler, struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *suffix, const void *value, size_t size, int flags) { const char *name = xattr_full_name(handler, suffix); struct kernfs_node *kn = inode->i_private; return kernfs_xattr_set(kn, name, value, size, flags); } static int kernfs_vfs_user_xattr_add(struct kernfs_node *kn, const char *full_name, struct simple_xattrs *xattrs, const void *value, size_t size, int flags) { struct kernfs_iattrs *attr = kernfs_iattrs_noalloc(kn); atomic_t *sz = &attr->user_xattr_size; atomic_t *nr = &attr->nr_user_xattrs; struct simple_xattr *old_xattr; int ret; if (atomic_inc_return(nr) > KERNFS_MAX_USER_XATTRS) { ret = -ENOSPC; goto dec_count_out; } if (atomic_add_return(size, sz) > KERNFS_USER_XATTR_SIZE_LIMIT) { ret = -ENOSPC; goto dec_size_out; } old_xattr = simple_xattr_set(xattrs, full_name, value, size, flags); if (!old_xattr) return 0; if (IS_ERR(old_xattr)) { ret = PTR_ERR(old_xattr); goto dec_size_out; } ret = 0; size = old_xattr->size; simple_xattr_free(old_xattr); dec_size_out: atomic_sub(size, sz); dec_count_out: atomic_dec(nr); return ret; } static int kernfs_vfs_user_xattr_rm(struct kernfs_node *kn, const char *full_name, struct simple_xattrs *xattrs, const void *value, size_t size, int flags) { struct kernfs_iattrs *attr = kernfs_iattrs_noalloc(kn); atomic_t *sz = &attr->user_xattr_size; atomic_t *nr = &attr->nr_user_xattrs; struct simple_xattr *old_xattr; old_xattr = simple_xattr_set(xattrs, full_name, value, size, flags); if (!old_xattr) return 0; if (IS_ERR(old_xattr)) return PTR_ERR(old_xattr); atomic_sub(old_xattr->size, sz); atomic_dec(nr); simple_xattr_free(old_xattr); return 0; } static int kernfs_vfs_user_xattr_set(const struct xattr_handler *handler, struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *suffix, const void *value, size_t size, int flags) { const char *full_name = xattr_full_name(handler, suffix); struct kernfs_node 
*kn = inode->i_private; struct kernfs_iattrs *attrs; if (!(kernfs_root(kn)->flags & KERNFS_ROOT_SUPPORT_USER_XATTR)) return -EOPNOTSUPP; attrs = kernfs_iattrs(kn); if (!attrs) return -ENOMEM; if (value) return kernfs_vfs_user_xattr_add(kn, full_name, &attrs->xattrs, value, size, flags); else return kernfs_vfs_user_xattr_rm(kn, full_name, &attrs->xattrs, value, size, flags); } static const struct xattr_handler kernfs_trusted_xattr_handler = { .prefix = XATTR_TRUSTED_PREFIX, .get = kernfs_vfs_xattr_get, .set = kernfs_vfs_xattr_set, }; static const struct xattr_handler kernfs_security_xattr_handler = { .prefix = XATTR_SECURITY_PREFIX, .get = kernfs_vfs_xattr_get, .set = kernfs_vfs_xattr_set, }; static const struct xattr_handler kernfs_user_xattr_handler = { .prefix = XATTR_USER_PREFIX, .get = kernfs_vfs_xattr_get, .set = kernfs_vfs_user_xattr_set, }; const struct xattr_handler * const kernfs_xattr_handlers[] = { &kernfs_trusted_xattr_handler, &kernfs_security_xattr_handler, &kernfs_user_xattr_handler, NULL };
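__kernfs_iattrs() above lazily allocates the attribute block and publishes it with try_cmpxchg(), dropping its own copy if another CPU won the race. A userspace sketch of the same publish-or-discard pattern, using C11 atomics and calloc/free in place of the kernel primitives:

#include <stdatomic.h>
#include <stdlib.h>

struct attrs { int dummy; };

static struct attrs *get_attrs(_Atomic(struct attrs *) *slot)
{
	struct attrs *cur = atomic_load(slot);
	struct attrs *fresh;

	if (cur)
		return cur;            /* already allocated by someone else */

	fresh = calloc(1, sizeof(*fresh));
	if (!fresh)
		return NULL;

	/* Publish; on failure 'cur' is updated to the winner's pointer. */
	if (atomic_compare_exchange_strong(slot, &cur, fresh))
		return fresh;

	free(fresh);                   /* we raced and lost */
	return cur;
}

int main(void)
{
	_Atomic(struct attrs *) slot = NULL;
	struct attrs *a = get_attrs(&slot);

	free(a);
	return a ? 0 : 1;
}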
12 4 8 2 1 4 5 5 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net> * Copyright (c) 2013 Eric Leblond <eric@regit.org> * * Development of this code funded by Astaro AG (http://www.astaro.com/) */ #include <linux/kernel.h> #include <linux/init.h> #include <linux/module.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables.h> #include <net/netfilter/nft_reject.h> #include <linux/icmp.h> #include <linux/icmpv6.h> const struct nla_policy nft_reject_policy[NFTA_REJECT_MAX + 1] = { [NFTA_REJECT_TYPE] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_REJECT_ICMP_CODE] = { .type = NLA_U8 }, }; EXPORT_SYMBOL_GPL(nft_reject_policy); int nft_reject_validate(const struct nft_ctx *ctx, const struct nft_expr *expr) { return nft_chain_validate_hooks(ctx->chain, (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_FORWARD) | (1 << NF_INET_LOCAL_OUT) | (1 << NF_INET_PRE_ROUTING)); } EXPORT_SYMBOL_GPL(nft_reject_validate); int nft_reject_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { struct nft_reject *priv = nft_expr_priv(expr); int icmp_code; if (tb[NFTA_REJECT_TYPE] == NULL) return -EINVAL; priv->type = ntohl(nla_get_be32(tb[NFTA_REJECT_TYPE])); switch (priv->type) { case NFT_REJECT_ICMP_UNREACH: case NFT_REJECT_ICMPX_UNREACH: if (tb[NFTA_REJECT_ICMP_CODE] == NULL) return -EINVAL; icmp_code = nla_get_u8(tb[NFTA_REJECT_ICMP_CODE]); if (priv->type == NFT_REJECT_ICMPX_UNREACH && icmp_code > NFT_REJECT_ICMPX_MAX) return -EINVAL; priv->icmp_code = icmp_code; break; case NFT_REJECT_TCP_RST: break; default: return -EINVAL; } return 0; } EXPORT_SYMBOL_GPL(nft_reject_init); int nft_reject_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { const struct nft_reject *priv = nft_expr_priv(expr); if (nla_put_be32(skb, NFTA_REJECT_TYPE, htonl(priv->type))) goto nla_put_failure; switch (priv->type) { case NFT_REJECT_ICMP_UNREACH: case NFT_REJECT_ICMPX_UNREACH: if (nla_put_u8(skb, NFTA_REJECT_ICMP_CODE, priv->icmp_code)) goto nla_put_failure; break; default: break; } return 0; nla_put_failure: return -1; } EXPORT_SYMBOL_GPL(nft_reject_dump); static u8 icmp_code_v4[NFT_REJECT_ICMPX_MAX + 1] = { [NFT_REJECT_ICMPX_NO_ROUTE] = ICMP_NET_UNREACH, [NFT_REJECT_ICMPX_PORT_UNREACH] = ICMP_PORT_UNREACH, [NFT_REJECT_ICMPX_HOST_UNREACH] = ICMP_HOST_UNREACH, [NFT_REJECT_ICMPX_ADMIN_PROHIBITED] = ICMP_PKT_FILTERED, }; int nft_reject_icmp_code(u8 code) { if (WARN_ON_ONCE(code > NFT_REJECT_ICMPX_MAX)) return ICMP_NET_UNREACH; return icmp_code_v4[code]; } EXPORT_SYMBOL_GPL(nft_reject_icmp_code); static u8 icmp_code_v6[NFT_REJECT_ICMPX_MAX + 1] = { [NFT_REJECT_ICMPX_NO_ROUTE] = ICMPV6_NOROUTE, [NFT_REJECT_ICMPX_PORT_UNREACH] = ICMPV6_PORT_UNREACH, [NFT_REJECT_ICMPX_HOST_UNREACH] = ICMPV6_ADDR_UNREACH, [NFT_REJECT_ICMPX_ADMIN_PROHIBITED] = ICMPV6_ADM_PROHIBITED, }; int nft_reject_icmpv6_code(u8 code) { if (WARN_ON_ONCE(code > NFT_REJECT_ICMPX_MAX)) return ICMPV6_NOROUTE; return icmp_code_v6[code]; } EXPORT_SYMBOL_GPL(nft_reject_icmpv6_code); 
MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>"); MODULE_DESCRIPTION("Netfilter x_tables over nftables module");
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * vma.h
 *
 * Core VMA manipulation API implemented in vma.c.
*/ #ifndef __MM_VMA_H #define __MM_VMA_H /* * VMA lock generalization */ struct vma_prepare { struct vm_area_struct *vma; struct vm_area_struct *adj_next; struct file *file; struct address_space *mapping; struct anon_vma *anon_vma; struct vm_area_struct *insert; struct vm_area_struct *remove; struct vm_area_struct *remove2; bool skip_vma_uprobe :1; }; struct unlink_vma_file_batch { int count; struct vm_area_struct *vmas[8]; }; /* * vma munmap operation */ struct vma_munmap_struct { struct vma_iterator *vmi; struct vm_area_struct *vma; /* The first vma to munmap */ struct vm_area_struct *prev; /* vma before the munmap area */ struct vm_area_struct *next; /* vma after the munmap area */ struct list_head *uf; /* Userfaultfd list_head */ unsigned long start; /* Aligned start addr (inclusive) */ unsigned long end; /* Aligned end addr (exclusive) */ unsigned long unmap_start; /* Unmap PTE start */ unsigned long unmap_end; /* Unmap PTE end */ int vma_count; /* Number of vmas that will be removed */ bool unlock; /* Unlock after the munmap */ bool clear_ptes; /* If there are outstanding PTE to be cleared */ /* 2 byte hole */ unsigned long nr_pages; /* Number of pages being removed */ unsigned long locked_vm; /* Number of locked pages */ unsigned long nr_accounted; /* Number of VM_ACCOUNT pages */ unsigned long exec_vm; unsigned long stack_vm; unsigned long data_vm; }; enum vma_merge_state { VMA_MERGE_START, VMA_MERGE_ERROR_NOMEM, VMA_MERGE_NOMERGE, VMA_MERGE_SUCCESS, }; /* * Describes a VMA merge operation and is threaded throughout it. * * Any of the fields may be mutated by the merge operation, so no guarantees are * made to the contents of this structure after a merge operation has completed. */ struct vma_merge_struct { struct mm_struct *mm; struct vma_iterator *vmi; /* * Adjacent VMAs, any of which may be NULL if not present: * * |------|--------|------| * | prev | middle | next | * |------|--------|------| * * middle may not yet exist in the case of a proposed new VMA being * merged, or it may be an existing VMA. * * next may be assigned by the caller. */ struct vm_area_struct *prev; struct vm_area_struct *middle; struct vm_area_struct *next; /* This is the VMA we ultimately target to become the merged VMA. */ struct vm_area_struct *target; /* * Initially, the start, end, pgoff fields are provided by the caller * and describe the proposed new VMA range, whether modifying an * existing VMA (which will be 'middle'), or adding a new one. * * During the merge process these fields are updated to describe the new * range _including those VMAs which will be merged_. */ unsigned long start; unsigned long end; pgoff_t pgoff; vm_flags_t vm_flags; struct file *file; struct anon_vma *anon_vma; struct mempolicy *policy; struct vm_userfaultfd_ctx uffd_ctx; struct anon_vma_name *anon_name; enum vma_merge_state state; /* Flags which callers can use to modify merge behaviour: */ /* * If we can expand, simply do so. We know there is nothing to merge to * the right. Does not reset state upon failure to merge. The VMA * iterator is assumed to be positioned at the previous VMA, rather than * at the gap. */ bool just_expand :1; /* * If a merge is possible, but an OOM error occurs, give up and don't * execute the merge, returning NULL. */ bool give_up_on_oom :1; /* * If set, skip uprobe_mmap upon merged vma. */ bool skip_vma_uprobe :1; /* Internal flags set during merge process: */ /* * Internal flag indicating the merge increases vmg->middle->vm_start * (and thereby, vmg->prev->vm_end). 
*/ bool __adjust_middle_start :1; /* * Internal flag indicating the merge decreases vmg->next->vm_start * (and thereby, vmg->middle->vm_end). */ bool __adjust_next_start :1; /* * Internal flag used during the merge operation to indicate we will * remove vmg->middle. */ bool __remove_middle :1; /* * Internal flag used during the merge operation to indicate we will * remove vmg->next. */ bool __remove_next :1; }; static inline bool vmg_nomem(struct vma_merge_struct *vmg) { return vmg->state == VMA_MERGE_ERROR_NOMEM; } /* Assumes addr >= vma->vm_start. */ static inline pgoff_t vma_pgoff_offset(struct vm_area_struct *vma, unsigned long addr) { return vma->vm_pgoff + PHYS_PFN(addr - vma->vm_start); } #define VMG_STATE(name, mm_, vmi_, start_, end_, vm_flags_, pgoff_) \ struct vma_merge_struct name = { \ .mm = mm_, \ .vmi = vmi_, \ .start = start_, \ .end = end_, \ .vm_flags = vm_flags_, \ .pgoff = pgoff_, \ .state = VMA_MERGE_START, \ } #define VMG_VMA_STATE(name, vmi_, prev_, vma_, start_, end_) \ struct vma_merge_struct name = { \ .mm = vma_->vm_mm, \ .vmi = vmi_, \ .prev = prev_, \ .middle = vma_, \ .next = NULL, \ .start = start_, \ .end = end_, \ .vm_flags = vma_->vm_flags, \ .pgoff = vma_pgoff_offset(vma_, start_), \ .file = vma_->vm_file, \ .anon_vma = vma_->anon_vma, \ .policy = vma_policy(vma_), \ .uffd_ctx = vma_->vm_userfaultfd_ctx, \ .anon_name = anon_vma_name(vma_), \ .state = VMA_MERGE_START, \ } #ifdef CONFIG_DEBUG_VM_MAPLE_TREE void validate_mm(struct mm_struct *mm); #else #define validate_mm(mm) do { } while (0) #endif __must_check int vma_expand(struct vma_merge_struct *vmg); __must_check int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long start, unsigned long end, pgoff_t pgoff); static inline int vma_iter_store_gfp(struct vma_iterator *vmi, struct vm_area_struct *vma, gfp_t gfp) { if (vmi->mas.status != ma_start && ((vmi->mas.index > vma->vm_start) || (vmi->mas.last < vma->vm_start))) vma_iter_invalidate(vmi); __mas_set_range(&vmi->mas, vma->vm_start, vma->vm_end - 1); mas_store_gfp(&vmi->mas, vma, gfp); if (unlikely(mas_is_err(&vmi->mas))) return -ENOMEM; vma_mark_attached(vma); return 0; } /* * Temporary helper function for stacked mmap handlers which specify * f_op->mmap() but which might have an underlying file system which implements * f_op->mmap_prepare(). */ static inline void set_vma_from_desc(struct vm_area_struct *vma, struct vm_area_desc *desc) { /* * Since we're invoking .mmap_prepare() despite having a partially * established VMA, we must take care to handle setting fields * correctly. */ /* Mutable fields. Populated with initial state. */ vma->vm_pgoff = desc->pgoff; if (desc->vm_file != vma->vm_file) vma_set_file(vma, desc->vm_file); if (desc->vm_flags != vma->vm_flags) vm_flags_set(vma, desc->vm_flags); vma->vm_page_prot = desc->page_prot; /* User-defined fields. */ vma->vm_ops = desc->vm_ops; vma->vm_private_data = desc->private_data; } int do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, struct mm_struct *mm, unsigned long start, unsigned long end, struct list_head *uf, bool unlock); int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm, unsigned long start, size_t len, struct list_head *uf, bool unlock); void remove_vma(struct vm_area_struct *vma); void unmap_region(struct ma_state *mas, struct vm_area_struct *vma, struct vm_area_struct *prev, struct vm_area_struct *next); /* We are about to modify the VMA's flags. 
*/ __must_check struct vm_area_struct *vma_modify_flags(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, unsigned long start, unsigned long end, vm_flags_t vm_flags); /* We are about to modify the VMA's anon_name. */ __must_check struct vm_area_struct *vma_modify_name(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, unsigned long start, unsigned long end, struct anon_vma_name *new_name); /* We are about to modify the VMA's memory policy. */ __must_check struct vm_area_struct *vma_modify_policy(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, unsigned long start, unsigned long end, struct mempolicy *new_pol); /* We are about to modify the VMA's flags and/or uffd context. */ __must_check struct vm_area_struct *vma_modify_flags_uffd(struct vma_iterator *vmi, struct vm_area_struct *prev, struct vm_area_struct *vma, unsigned long start, unsigned long end, vm_flags_t vm_flags, struct vm_userfaultfd_ctx new_ctx, bool give_up_on_oom); __must_check struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg); __must_check struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long delta); void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb); void unlink_file_vma_batch_final(struct unlink_vma_file_batch *vb); void unlink_file_vma_batch_add(struct unlink_vma_file_batch *vb, struct vm_area_struct *vma); void unlink_file_vma(struct vm_area_struct *vma); void vma_link_file(struct vm_area_struct *vma); int vma_link(struct mm_struct *mm, struct vm_area_struct *vma); struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, unsigned long addr, unsigned long len, pgoff_t pgoff, bool *need_rmap_locks); struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma); bool vma_needs_dirty_tracking(struct vm_area_struct *vma); bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot); int mm_take_all_locks(struct mm_struct *mm); void mm_drop_all_locks(struct mm_struct *mm); unsigned long mmap_region(struct file *file, unsigned long addr, unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, struct list_head *uf); int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *brkvma, unsigned long addr, unsigned long request, unsigned long flags); unsigned long unmapped_area(struct vm_unmapped_area_info *info); unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info); static inline bool vma_wants_manual_pte_write_upgrade(struct vm_area_struct *vma) { /* * We want to check manually if we can change individual PTEs writable * if we can't do that automatically for all PTEs in a mapping. For * private mappings, that's always the case when we have write * permissions as we properly have to handle COW. */ if (vma->vm_flags & VM_SHARED) return vma_wants_writenotify(vma, vma->vm_page_prot); return !!(vma->vm_flags & VM_WRITE); } #ifdef CONFIG_MMU static inline pgprot_t vm_pgprot_modify(pgprot_t oldprot, vm_flags_t vm_flags) { return pgprot_modify(oldprot, vm_get_page_prot(vm_flags)); } #endif static inline struct vm_area_struct *vma_prev_limit(struct vma_iterator *vmi, unsigned long min) { return mas_prev(&vmi->mas, min); } /* * These three helpers classifies VMAs for virtual memory accounting. 
*/ /* * Executable code area - executable, not writable, not stack */ static inline bool is_exec_mapping(vm_flags_t flags) { return (flags & (VM_EXEC | VM_WRITE | VM_STACK)) == VM_EXEC; } /* * Stack area (including shadow stacks) * * VM_GROWSUP / VM_GROWSDOWN VMAs are always private anonymous: * do_mmap() forbids all other combinations. */ static inline bool is_stack_mapping(vm_flags_t flags) { return ((flags & VM_STACK) == VM_STACK) || (flags & VM_SHADOW_STACK); } /* * Data area - private, writable, not stack */ static inline bool is_data_mapping(vm_flags_t flags) { return (flags & (VM_WRITE | VM_SHARED | VM_STACK)) == VM_WRITE; } static inline void vma_iter_config(struct vma_iterator *vmi, unsigned long index, unsigned long last) { __mas_set_range(&vmi->mas, index, last - 1); } static inline void vma_iter_reset(struct vma_iterator *vmi) { mas_reset(&vmi->mas); } static inline struct vm_area_struct *vma_iter_prev_range_limit(struct vma_iterator *vmi, unsigned long min) { return mas_prev_range(&vmi->mas, min); } static inline struct vm_area_struct *vma_iter_next_range_limit(struct vma_iterator *vmi, unsigned long max) { return mas_next_range(&vmi->mas, max); } static inline int vma_iter_area_lowest(struct vma_iterator *vmi, unsigned long min, unsigned long max, unsigned long size) { return mas_empty_area(&vmi->mas, min, max - 1, size); } static inline int vma_iter_area_highest(struct vma_iterator *vmi, unsigned long min, unsigned long max, unsigned long size) { return mas_empty_area_rev(&vmi->mas, min, max - 1, size); } /* * VMA Iterator functions shared between nommu and mmap */ static inline int vma_iter_prealloc(struct vma_iterator *vmi, struct vm_area_struct *vma) { return mas_preallocate(&vmi->mas, vma, GFP_KERNEL); } static inline void vma_iter_clear(struct vma_iterator *vmi) { mas_store_prealloc(&vmi->mas, NULL); } static inline struct vm_area_struct *vma_iter_load(struct vma_iterator *vmi) { return mas_walk(&vmi->mas); } /* Store a VMA with preallocated memory */ static inline void vma_iter_store_overwrite(struct vma_iterator *vmi, struct vm_area_struct *vma) { vma_assert_attached(vma); #if defined(CONFIG_DEBUG_VM_MAPLE_TREE) if (MAS_WARN_ON(&vmi->mas, vmi->mas.status != ma_start && vmi->mas.index > vma->vm_start)) { pr_warn("%lx > %lx\n store vma %lx-%lx\n into slot %lx-%lx\n", vmi->mas.index, vma->vm_start, vma->vm_start, vma->vm_end, vmi->mas.index, vmi->mas.last); } if (MAS_WARN_ON(&vmi->mas, vmi->mas.status != ma_start && vmi->mas.last < vma->vm_start)) { pr_warn("%lx < %lx\nstore vma %lx-%lx\ninto slot %lx-%lx\n", vmi->mas.last, vma->vm_start, vma->vm_start, vma->vm_end, vmi->mas.index, vmi->mas.last); } #endif if (vmi->mas.status != ma_start && ((vmi->mas.index > vma->vm_start) || (vmi->mas.last < vma->vm_start))) vma_iter_invalidate(vmi); __mas_set_range(&vmi->mas, vma->vm_start, vma->vm_end - 1); mas_store_prealloc(&vmi->mas, vma); } static inline void vma_iter_store_new(struct vma_iterator *vmi, struct vm_area_struct *vma) { vma_mark_attached(vma); vma_iter_store_overwrite(vmi, vma); } static inline unsigned long vma_iter_addr(struct vma_iterator *vmi) { return vmi->mas.index; } static inline unsigned long vma_iter_end(struct vma_iterator *vmi) { return vmi->mas.last + 1; } static inline int vma_iter_bulk_alloc(struct vma_iterator *vmi, unsigned long count) { return mas_expected_entries(&vmi->mas, count); } static inline struct vm_area_struct *vma_iter_prev_range(struct vma_iterator *vmi) { return mas_prev_range(&vmi->mas, 0); } /* * Retrieve the next VMA and rewind the 
iterator to end of the previous VMA, or * if no previous VMA, to index 0. */ static inline struct vm_area_struct *vma_iter_next_rewind(struct vma_iterator *vmi, struct vm_area_struct **pprev) { struct vm_area_struct *next = vma_next(vmi); struct vm_area_struct *prev = vma_prev(vmi); /* * Consider the case where no previous VMA exists. We advance to the * next VMA, skipping any gap, then rewind to the start of the range. * * If we were to unconditionally advance to the next range we'd wind up * at the next VMA again, so we check to ensure there is a previous VMA * to skip over. */ if (prev) vma_iter_next_range(vmi); if (pprev) *pprev = prev; return next; } #ifdef CONFIG_64BIT static inline bool vma_is_sealed(struct vm_area_struct *vma) { return (vma->vm_flags & VM_SEALED); } #else static inline bool vma_is_sealed(struct vm_area_struct *vma) { return false; } #endif #if defined(CONFIG_STACK_GROWSUP) int expand_upwards(struct vm_area_struct *vma, unsigned long address); #endif int expand_downwards(struct vm_area_struct *vma, unsigned long address); int __vm_munmap(unsigned long start, size_t len, bool unlock); int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma); /* vma_init.h, shared between CONFIG_MMU and nommu. */ void __init vma_state_init(void); struct vm_area_struct *vm_area_alloc(struct mm_struct *mm); struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig); void vm_area_free(struct vm_area_struct *vma); /* vma_exec.c */ #ifdef CONFIG_MMU int create_init_stack_vma(struct mm_struct *mm, struct vm_area_struct **vmap, unsigned long *top_mem_p); int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift); #endif #endif /* __MM_VMA_H */
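The accounting helpers is_exec_mapping(), is_stack_mapping() and is_data_mapping() above test several VM_* bits with a single mask-and-compare. A userspace sketch of that idiom with made-up flag values (the real VM_* constants live in the kernel headers):

#include <stdbool.h>
#include <stdio.h>

#define VMF_WRITE  0x1u
#define VMF_EXEC   0x2u
#define VMF_SHARED 0x4u
#define VMF_STACK  0x8u

/* executable, not writable, not stack */
static bool is_exec_mapping(unsigned int flags)
{
	return (flags & (VMF_EXEC | VMF_WRITE | VMF_STACK)) == VMF_EXEC;
}

/* private, writable, not stack */
static bool is_data_mapping(unsigned int flags)
{
	return (flags & (VMF_WRITE | VMF_SHARED | VMF_STACK)) == VMF_WRITE;
}

int main(void)
{
	printf("exec-only mapping counted as exec: %d\n",
	       is_exec_mapping(VMF_EXEC));
	printf("shared writable mapping counted as private data: %d\n",
	       is_data_mapping(VMF_WRITE | VMF_SHARED));
	return 0;
}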
40 40 51 2 53 9 53 9 53 53 53 54 1 6 40 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB /* * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved. * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved. */ #include <rdma/rdma_netlink.h> #include <net/addrconf.h> #include "rxe.h" #include "rxe_loc.h" MODULE_AUTHOR("Bob Pearson, Frank Zago, John Groves, Kamal Heib"); MODULE_DESCRIPTION("Soft RDMA transport"); MODULE_LICENSE("Dual BSD/GPL"); /* free resources for a rxe device all objects created for this device must * have been destroyed */ void rxe_dealloc(struct ib_device *ib_dev) { struct rxe_dev *rxe = container_of(ib_dev, struct rxe_dev, ib_dev); rxe_pool_cleanup(&rxe->uc_pool); rxe_pool_cleanup(&rxe->pd_pool); rxe_pool_cleanup(&rxe->ah_pool); rxe_pool_cleanup(&rxe->srq_pool); rxe_pool_cleanup(&rxe->qp_pool); rxe_pool_cleanup(&rxe->cq_pool); rxe_pool_cleanup(&rxe->mr_pool); rxe_pool_cleanup(&rxe->mw_pool); WARN_ON(!RB_EMPTY_ROOT(&rxe->mcg_tree)); mutex_destroy(&rxe->usdev_lock); } static const struct ib_device_ops rxe_ib_dev_odp_ops = { .advise_mr = rxe_ib_advise_mr, }; /* initialize rxe device parameters */ static void rxe_init_device_param(struct rxe_dev *rxe, struct net_device *ndev) { rxe->max_inline_data = RXE_MAX_INLINE_DATA; rxe->attr.vendor_id = RXE_VENDOR_ID; rxe->attr.max_mr_size = RXE_MAX_MR_SIZE; rxe->attr.page_size_cap = RXE_PAGE_SIZE_CAP; rxe->attr.max_qp = RXE_MAX_QP; rxe->attr.max_qp_wr = RXE_MAX_QP_WR; rxe->attr.device_cap_flags = RXE_DEVICE_CAP_FLAGS; rxe->attr.kernel_cap_flags = IBK_ALLOW_USER_UNREG; rxe->attr.max_send_sge = RXE_MAX_SGE; rxe->attr.max_recv_sge = RXE_MAX_SGE; rxe->attr.max_sge_rd = RXE_MAX_SGE_RD; rxe->attr.max_cq = RXE_MAX_CQ; rxe->attr.max_cqe = (1 << RXE_MAX_LOG_CQE) - 1; rxe->attr.max_mr = RXE_MAX_MR; rxe->attr.max_mw = RXE_MAX_MW; rxe->attr.max_pd = RXE_MAX_PD; rxe->attr.max_qp_rd_atom = RXE_MAX_QP_RD_ATOM; rxe->attr.max_res_rd_atom = RXE_MAX_RES_RD_ATOM; rxe->attr.max_qp_init_rd_atom = RXE_MAX_QP_INIT_RD_ATOM; rxe->attr.atomic_cap = IB_ATOMIC_HCA; rxe->attr.max_mcast_grp = RXE_MAX_MCAST_GRP; rxe->attr.max_mcast_qp_attach = RXE_MAX_MCAST_QP_ATTACH; rxe->attr.max_total_mcast_qp_attach = RXE_MAX_TOT_MCAST_QP_ATTACH; rxe->attr.max_ah = RXE_MAX_AH; rxe->attr.max_srq = RXE_MAX_SRQ; rxe->attr.max_srq_wr = RXE_MAX_SRQ_WR; rxe->attr.max_srq_sge = RXE_MAX_SRQ_SGE; rxe->attr.max_fast_reg_page_list_len = RXE_MAX_FMR_PAGE_LIST_LEN; rxe->attr.max_pkeys = RXE_MAX_PKEYS; rxe->attr.local_ca_ack_delay = RXE_LOCAL_CA_ACK_DELAY; if 
(ndev->addr_len) { memcpy(rxe->raw_gid, ndev->dev_addr, min_t(unsigned int, ndev->addr_len, ETH_ALEN)); } else { /* * This device does not have a HW address, but * connection management requires a unique gid. */ eth_random_addr(rxe->raw_gid); } addrconf_addr_eui48((unsigned char *)&rxe->attr.sys_image_guid, rxe->raw_gid); rxe->max_ucontext = RXE_MAX_UCONTEXT; if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) { rxe->attr.kernel_cap_flags |= IBK_ON_DEMAND_PAGING; /* IB_ODP_SUPPORT_IMPLICIT is not supported right now. */ rxe->attr.odp_caps.general_caps |= IB_ODP_SUPPORT; rxe->attr.odp_caps.per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SEND; rxe->attr.odp_caps.per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_RECV; rxe->attr.odp_caps.per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV; rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SEND; rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_RECV; rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_WRITE; rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_READ; rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_ATOMIC; rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV; rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_FLUSH; rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_ATOMIC_WRITE; /* set handler for ODP prefetching API - ibv_advise_mr(3) */ ib_set_device_ops(&rxe->ib_dev, &rxe_ib_dev_odp_ops); } } /* initialize port attributes */ static void rxe_init_port_param(struct rxe_port *port) { port->attr.state = IB_PORT_DOWN; port->attr.max_mtu = IB_MTU_4096; port->attr.active_mtu = IB_MTU_256; port->attr.gid_tbl_len = RXE_PORT_GID_TBL_LEN; port->attr.port_cap_flags = RXE_PORT_PORT_CAP_FLAGS; port->attr.max_msg_sz = RXE_PORT_MAX_MSG_SZ; port->attr.bad_pkey_cntr = RXE_PORT_BAD_PKEY_CNTR; port->attr.qkey_viol_cntr = RXE_PORT_QKEY_VIOL_CNTR; port->attr.pkey_tbl_len = RXE_PORT_PKEY_TBL_LEN; port->attr.lid = RXE_PORT_LID; port->attr.sm_lid = RXE_PORT_SM_LID; port->attr.lmc = RXE_PORT_LMC; port->attr.max_vl_num = RXE_PORT_MAX_VL_NUM; port->attr.sm_sl = RXE_PORT_SM_SL; port->attr.subnet_timeout = RXE_PORT_SUBNET_TIMEOUT; port->attr.init_type_reply = RXE_PORT_INIT_TYPE_REPLY; port->attr.active_width = RXE_PORT_ACTIVE_WIDTH; port->attr.active_speed = RXE_PORT_ACTIVE_SPEED; port->attr.phys_state = RXE_PORT_PHYS_STATE; port->mtu_cap = ib_mtu_enum_to_int(IB_MTU_256); port->subnet_prefix = cpu_to_be64(RXE_PORT_SUBNET_PREFIX); } /* initialize port state, note IB convention that HCA ports are always * numbered from 1 */ static void rxe_init_ports(struct rxe_dev *rxe, struct net_device *ndev) { struct rxe_port *port = &rxe->port; rxe_init_port_param(port); addrconf_addr_eui48((unsigned char *)&port->port_guid, rxe->raw_gid); spin_lock_init(&port->port_lock); } /* init pools of managed objects */ static void rxe_init_pools(struct rxe_dev *rxe) { rxe_pool_init(rxe, &rxe->uc_pool, RXE_TYPE_UC); rxe_pool_init(rxe, &rxe->pd_pool, RXE_TYPE_PD); rxe_pool_init(rxe, &rxe->ah_pool, RXE_TYPE_AH); rxe_pool_init(rxe, &rxe->srq_pool, RXE_TYPE_SRQ); rxe_pool_init(rxe, &rxe->qp_pool, RXE_TYPE_QP); rxe_pool_init(rxe, &rxe->cq_pool, RXE_TYPE_CQ); rxe_pool_init(rxe, &rxe->mr_pool, RXE_TYPE_MR); rxe_pool_init(rxe, &rxe->mw_pool, RXE_TYPE_MW); } /* initialize rxe device state */ static void rxe_init(struct rxe_dev *rxe, struct net_device *ndev) { /* init default device parameters */ rxe_init_device_param(rxe, ndev);
rxe_init_ports(rxe, ndev); rxe_init_pools(rxe); /* init pending mmap list */ spin_lock_init(&rxe->mmap_offset_lock); spin_lock_init(&rxe->pending_lock); INIT_LIST_HEAD(&rxe->pending_mmaps); /* init multicast support */ spin_lock_init(&rxe->mcg_lock); rxe->mcg_tree = RB_ROOT; mutex_init(&rxe->usdev_lock); } void rxe_set_mtu(struct rxe_dev *rxe, unsigned int ndev_mtu) { struct rxe_port *port = &rxe->port; enum ib_mtu mtu; mtu = eth_mtu_int_to_enum(ndev_mtu); /* Make sure that new MTU in range */ mtu = mtu ? min_t(enum ib_mtu, mtu, IB_MTU_4096) : IB_MTU_256; port->attr.active_mtu = mtu; port->mtu_cap = ib_mtu_enum_to_int(mtu); } /* called by ifc layer to create new rxe device. * The caller should allocate memory for rxe by calling ib_alloc_device. */ int rxe_add(struct rxe_dev *rxe, unsigned int mtu, const char *ibdev_name, struct net_device *ndev) { rxe_init(rxe, ndev); rxe_set_mtu(rxe, mtu); return rxe_register_device(rxe, ibdev_name, ndev); } static int rxe_newlink(const char *ibdev_name, struct net_device *ndev) { struct rxe_dev *rxe; int err = 0; if (is_vlan_dev(ndev)) { rxe_err("rxe creation allowed on top of a real device only\n"); err = -EPERM; goto err; } rxe = rxe_get_dev_from_net(ndev); if (rxe) { ib_device_put(&rxe->ib_dev); rxe_err_dev(rxe, "already configured on %s\n", ndev->name); err = -EEXIST; goto err; } err = rxe_net_add(ibdev_name, ndev); if (err) { rxe_err("failed to add %s\n", ndev->name); goto err; } err: return err; } static struct rdma_link_ops rxe_link_ops = { .type = "rxe", .newlink = rxe_newlink, }; static int __init rxe_module_init(void) { int err; err = rxe_alloc_wq(); if (err) return err; err = rxe_net_init(); if (err) { rxe_destroy_wq(); return err; } rdma_link_register(&rxe_link_ops); pr_info("loaded\n"); return 0; } static void __exit rxe_module_exit(void) { rdma_link_unregister(&rxe_link_ops); ib_unregister_driver(RDMA_DRIVER_RXE); rxe_net_exit(); rxe_destroy_wq(); pr_info("unloaded\n"); } late_initcall(rxe_module_init); module_exit(rxe_module_exit); MODULE_ALIAS_RDMA_LINK("rxe");
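A small, hypothetical standalone sketch of the clamping rule applied by rxe_set_mtu() above: the netdev MTU is converted to an IB enum and then forced into the [IB_MTU_256, IB_MTU_4096] range, falling back to IB_MTU_256 when the conversion yields 0.

/* Hypothetical helper restating the clamp in rxe_set_mtu(); not part of the
 * driver. eth_mtu_int_to_enum() returns 0 when ndev_mtu is too small for any
 * IB MTU value.
 */
static enum ib_mtu example_clamp_mtu(unsigned int ndev_mtu)
{
	enum ib_mtu mtu = eth_mtu_int_to_enum(ndev_mtu);

	return mtu ? min_t(enum ib_mtu, mtu, IB_MTU_4096) : IB_MTU_256;
}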
/* SPDX-License-Identifier: GPL-2.0-only */ /* include/net/xdp.h * * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc. */ #ifndef __LINUX_NET_XDP_H__ #define __LINUX_NET_XDP_H__ #include <linux/bitfield.h> #include <linux/filter.h> #include <linux/netdevice.h> #include <linux/skbuff.h> /* skb_shared_info */ #include <net/page_pool/types.h> /** * DOC: XDP RX-queue information * * The XDP RX-queue info (xdp_rxq_info) is associated with the driver * level RX-ring queues. It is information that is specific to how * the driver has configured a given RX-ring queue. * * Each xdp_buff frame received in the driver carries a (pointer) * reference to this xdp_rxq_info structure. This provides the XDP * data-path read-access to RX-info for both kernel and bpf-side * (limited subset).
* * For now, direct access is only safe while running in NAPI/softirq * context. Contents are read-mostly and must not be updated during * driver NAPI/softirq poll. * * The driver usage API is a register and unregister API. * * The struct is not directly tied to the XDP prog. A new XDP prog * can be attached as long as it doesn't change the underlying * RX-ring. If the RX-ring does change significantly, the NIC driver * naturally needs to stop the RX-ring before purging and reallocating * memory. In that process the driver MUST call unregister (which * also applies for driver shutdown and unload). The register API is * also mandatory during RX-ring setup. */ enum xdp_mem_type { MEM_TYPE_PAGE_SHARED = 0, /* Split-page refcnt based model */ MEM_TYPE_PAGE_ORDER0, /* Orig XDP full page model */ MEM_TYPE_PAGE_POOL, MEM_TYPE_XSK_BUFF_POOL, MEM_TYPE_MAX, }; /* XDP flags for ndo_xdp_xmit */ #define XDP_XMIT_FLUSH (1U << 0) /* doorbell signal consumer */ #define XDP_XMIT_FLAGS_MASK XDP_XMIT_FLUSH struct xdp_mem_info { u32 type; /* enum xdp_mem_type, but known size type */ u32 id; }; struct page_pool; struct xdp_rxq_info { struct net_device *dev; u32 queue_index; u32 reg_state; struct xdp_mem_info mem; u32 frag_size; } ____cacheline_aligned; /* perf critical, avoid false-sharing */ struct xdp_txq_info { struct net_device *dev; }; enum xdp_buff_flags { XDP_FLAGS_HAS_FRAGS = BIT(0), /* non-linear xdp buff */ XDP_FLAGS_FRAGS_PF_MEMALLOC = BIT(1), /* xdp paged memory is under * pressure */ /* frags have unreadable mem, this can't be true for real XDP packets, * but drivers may use XDP helpers to construct Rx pkt state even when * XDP program is not attached. */ XDP_FLAGS_FRAGS_UNREADABLE = BIT(2), }; struct xdp_buff { void *data; void *data_end; void *data_meta; void *data_hard_start; struct xdp_rxq_info *rxq; struct xdp_txq_info *txq; union { struct { /* frame size to deduce data_hard_end/tailroom */ u32 frame_sz; /* supported values defined in xdp_buff_flags */ u32 flags; }; #ifdef __LITTLE_ENDIAN /* Used to micro-optimize xdp_init_buff(), don't use directly */ u64 frame_sz_flags_init; #endif }; }; static __always_inline bool xdp_buff_has_frags(const struct xdp_buff *xdp) { return !!(xdp->flags & XDP_FLAGS_HAS_FRAGS); } static __always_inline void xdp_buff_set_frags_flag(struct xdp_buff *xdp) { xdp->flags |= XDP_FLAGS_HAS_FRAGS; } static __always_inline void xdp_buff_clear_frags_flag(struct xdp_buff *xdp) { xdp->flags &= ~XDP_FLAGS_HAS_FRAGS; } static __always_inline void xdp_buff_set_frag_pfmemalloc(struct xdp_buff *xdp) { xdp->flags |= XDP_FLAGS_FRAGS_PF_MEMALLOC; } static __always_inline void xdp_buff_set_frag_unreadable(struct xdp_buff *xdp) { xdp->flags |= XDP_FLAGS_FRAGS_UNREADABLE; } static __always_inline u32 xdp_buff_get_skb_flags(const struct xdp_buff *xdp) { return xdp->flags; } static __always_inline void xdp_buff_clear_frag_pfmemalloc(struct xdp_buff *xdp) { xdp->flags &= ~XDP_FLAGS_FRAGS_PF_MEMALLOC; } static __always_inline void xdp_init_buff(struct xdp_buff *xdp, u32 frame_sz, struct xdp_rxq_info *rxq) { xdp->rxq = rxq; #ifdef __LITTLE_ENDIAN /* * Force the compilers to initialize ::flags and assign ::frame_sz with * one write on 64-bit LE architectures as they're often unable to do * it themselves. 
*/ xdp->frame_sz_flags_init = frame_sz; #else xdp->frame_sz = frame_sz; xdp->flags = 0; #endif } static __always_inline void xdp_prepare_buff(struct xdp_buff *xdp, unsigned char *hard_start, int headroom, int data_len, const bool meta_valid) { unsigned char *data = hard_start + headroom; xdp->data_hard_start = hard_start; xdp->data = data; xdp->data_end = data + data_len; xdp->data_meta = meta_valid ? data : data + 1; } /* Reserve memory area at end-of data area. * * This macro reserves tailroom in the XDP buffer by limiting the * XDP/BPF data access to data_hard_end. Notice same area (and size) * is used for XDP_PASS, when constructing the SKB via build_skb(). */ #define xdp_data_hard_end(xdp) \ ((xdp)->data_hard_start + (xdp)->frame_sz - \ SKB_DATA_ALIGN(sizeof(struct skb_shared_info))) static inline struct skb_shared_info * xdp_get_shared_info_from_buff(const struct xdp_buff *xdp) { return (struct skb_shared_info *)xdp_data_hard_end(xdp); } static __always_inline unsigned int xdp_get_buff_len(const struct xdp_buff *xdp) { unsigned int len = xdp->data_end - xdp->data; const struct skb_shared_info *sinfo; if (likely(!xdp_buff_has_frags(xdp))) goto out; sinfo = xdp_get_shared_info_from_buff(xdp); len += sinfo->xdp_frags_size; out: return len; } void xdp_return_frag(netmem_ref netmem, const struct xdp_buff *xdp); /** * __xdp_buff_add_frag - attach frag to &xdp_buff * @xdp: XDP buffer to attach the frag to * @netmem: network memory containing the frag * @offset: offset at which the frag starts * @size: size of the frag * @truesize: total memory size occupied by the frag * @try_coalesce: whether to try coalescing the frags (not valid for XSk) * * Attach frag to the XDP buffer. If it currently has no frags attached, * initialize the related fields, otherwise check that the frag number * didn't reach the limit of ``MAX_SKB_FRAGS``. If possible, try coalescing * the frag with the previous one. * The function doesn't check/update the pfmemalloc bit. Please use the * non-underscored wrapper in drivers. * * Return: true on success, false if there's no space for the frag in * the shared info struct. */ static inline bool __xdp_buff_add_frag(struct xdp_buff *xdp, netmem_ref netmem, u32 offset, u32 size, u32 truesize, bool try_coalesce) { struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); skb_frag_t *prev; u32 nr_frags; if (!xdp_buff_has_frags(xdp)) { xdp_buff_set_frags_flag(xdp); nr_frags = 0; sinfo->xdp_frags_size = 0; sinfo->xdp_frags_truesize = 0; goto fill; } nr_frags = sinfo->nr_frags; prev = &sinfo->frags[nr_frags - 1]; if (try_coalesce && netmem == skb_frag_netmem(prev) && offset == skb_frag_off(prev) + skb_frag_size(prev)) { skb_frag_size_add(prev, size); /* Guaranteed to only decrement the refcount */ xdp_return_frag(netmem, xdp); } else if (unlikely(nr_frags == MAX_SKB_FRAGS)) { return false; } else { fill: __skb_fill_netmem_desc_noacc(sinfo, nr_frags++, netmem, offset, size); } sinfo->nr_frags = nr_frags; sinfo->xdp_frags_size += size; sinfo->xdp_frags_truesize += truesize; return true; } /** * xdp_buff_add_frag - attach frag to &xdp_buff * @xdp: XDP buffer to attach the frag to * @netmem: network memory containing the frag * @offset: offset at which the frag starts * @size: size of the frag * @truesize: total memory size occupied by the frag * * Version of __xdp_buff_add_frag() which takes care of the pfmemalloc bit. * * Return: true on success, false if there's no space for the frag in * the shared info struct. 
*/ static inline bool xdp_buff_add_frag(struct xdp_buff *xdp, netmem_ref netmem, u32 offset, u32 size, u32 truesize) { if (!__xdp_buff_add_frag(xdp, netmem, offset, size, truesize, true)) return false; if (unlikely(netmem_is_pfmemalloc(netmem))) xdp_buff_set_frag_pfmemalloc(xdp); if (unlikely(netmem_is_net_iov(netmem))) xdp_buff_set_frag_unreadable(xdp); return true; } struct xdp_frame { void *data; u32 len; u32 headroom; u32 metasize; /* uses lower 8-bits */ /* Lifetime of xdp_rxq_info is limited to NAPI/enqueue time, * while mem_type is valid on remote CPU. */ enum xdp_mem_type mem_type:32; struct net_device *dev_rx; /* used by cpumap */ u32 frame_sz; u32 flags; /* supported values defined in xdp_buff_flags */ }; static __always_inline bool xdp_frame_has_frags(const struct xdp_frame *frame) { return !!(frame->flags & XDP_FLAGS_HAS_FRAGS); } static __always_inline u32 xdp_frame_get_skb_flags(const struct xdp_frame *frame) { return frame->flags; } #define XDP_BULK_QUEUE_SIZE 16 struct xdp_frame_bulk { int count; netmem_ref q[XDP_BULK_QUEUE_SIZE]; }; static __always_inline void xdp_frame_bulk_init(struct xdp_frame_bulk *bq) { bq->count = 0; } static inline struct skb_shared_info * xdp_get_shared_info_from_frame(const struct xdp_frame *frame) { void *data_hard_start = frame->data - frame->headroom - sizeof(*frame); return (struct skb_shared_info *)(data_hard_start + frame->frame_sz - SKB_DATA_ALIGN(sizeof(struct skb_shared_info))); } struct xdp_cpumap_stats { unsigned int redirect; unsigned int pass; unsigned int drop; }; /* Clear kernel pointers in xdp_frame */ static inline void xdp_scrub_frame(struct xdp_frame *frame) { frame->data = NULL; frame->dev_rx = NULL; } static inline void xdp_update_skb_frags_info(struct sk_buff *skb, u8 nr_frags, unsigned int size, unsigned int truesize, u32 xdp_flags) { struct skb_shared_info *sinfo = skb_shinfo(skb); sinfo->nr_frags = nr_frags; /* * ``destructor_arg`` is unionized with ``xdp_frags_{,true}size``, * reset it after that these fields aren't used anymore. */ sinfo->destructor_arg = NULL; skb->len += size; skb->data_len += size; skb->truesize += truesize; skb->pfmemalloc |= !!(xdp_flags & XDP_FLAGS_FRAGS_PF_MEMALLOC); skb->unreadable |= !!(xdp_flags & XDP_FLAGS_FRAGS_UNREADABLE); } /* Avoids inlining WARN macro in fast-path */ void xdp_warn(const char *msg, const char *func, const int line); #define XDP_WARN(msg) xdp_warn(msg, __func__, __LINE__) struct sk_buff *xdp_build_skb_from_buff(const struct xdp_buff *xdp); struct sk_buff *xdp_build_skb_from_zc(struct xdp_buff *xdp); struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp); struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf, struct sk_buff *skb, struct net_device *dev); struct sk_buff *xdp_build_skb_from_frame(struct xdp_frame *xdpf, struct net_device *dev); struct xdp_frame *xdpf_clone(struct xdp_frame *xdpf); static inline void xdp_convert_frame_to_buff(const struct xdp_frame *frame, struct xdp_buff *xdp) { xdp->data_hard_start = frame->data - frame->headroom - sizeof(*frame); xdp->data = frame->data; xdp->data_end = frame->data + frame->len; xdp->data_meta = frame->data - frame->metasize; xdp->frame_sz = frame->frame_sz; xdp->flags = frame->flags; } static inline int xdp_update_frame_from_buff(const struct xdp_buff *xdp, struct xdp_frame *xdp_frame) { int metasize, headroom; /* Assure headroom is available for storing info */ headroom = xdp->data - xdp->data_hard_start; metasize = xdp->data - xdp->data_meta; metasize = metasize > 0 ? 
metasize : 0; if (unlikely((headroom - metasize) < sizeof(*xdp_frame))) return -ENOSPC; /* Catch if driver didn't reserve tailroom for skb_shared_info */ if (unlikely(xdp->data_end > xdp_data_hard_end(xdp))) { XDP_WARN("Driver BUG: missing reserved tailroom"); return -ENOSPC; } xdp_frame->data = xdp->data; xdp_frame->len = xdp->data_end - xdp->data; xdp_frame->headroom = headroom - sizeof(*xdp_frame); xdp_frame->metasize = metasize; xdp_frame->frame_sz = xdp->frame_sz; xdp_frame->flags = xdp->flags; return 0; } /* Convert xdp_buff to xdp_frame */ static inline struct xdp_frame *xdp_convert_buff_to_frame(struct xdp_buff *xdp) { struct xdp_frame *xdp_frame; if (xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL) return xdp_convert_zc_to_xdp_frame(xdp); /* Store info in top of packet */ xdp_frame = xdp->data_hard_start; if (unlikely(xdp_update_frame_from_buff(xdp, xdp_frame) < 0)) return NULL; /* rxq only valid until napi_schedule ends, convert to xdp_mem_type */ xdp_frame->mem_type = xdp->rxq->mem.type; return xdp_frame; } void __xdp_return(netmem_ref netmem, enum xdp_mem_type mem_type, bool napi_direct, struct xdp_buff *xdp); void xdp_return_frame(struct xdp_frame *xdpf); void xdp_return_frame_rx_napi(struct xdp_frame *xdpf); void xdp_return_buff(struct xdp_buff *xdp); void xdp_return_frame_bulk(struct xdp_frame *xdpf, struct xdp_frame_bulk *bq); static inline void xdp_flush_frame_bulk(struct xdp_frame_bulk *bq) { if (unlikely(!bq->count)) return; page_pool_put_netmem_bulk(bq->q, bq->count); bq->count = 0; } static __always_inline unsigned int xdp_get_frame_len(const struct xdp_frame *xdpf) { const struct skb_shared_info *sinfo; unsigned int len = xdpf->len; if (likely(!xdp_frame_has_frags(xdpf))) goto out; sinfo = xdp_get_shared_info_from_frame(xdpf); len += sinfo->xdp_frags_size; out: return len; } int __xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, struct net_device *dev, u32 queue_index, unsigned int napi_id, u32 frag_size); static inline int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, struct net_device *dev, u32 queue_index, unsigned int napi_id) { return __xdp_rxq_info_reg(xdp_rxq, dev, queue_index, napi_id, 0); } void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq); void xdp_rxq_info_unused(struct xdp_rxq_info *xdp_rxq); bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq); int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq, enum xdp_mem_type type, void *allocator); void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq); int xdp_reg_mem_model(struct xdp_mem_info *mem, enum xdp_mem_type type, void *allocator); void xdp_unreg_mem_model(struct xdp_mem_info *mem); int xdp_reg_page_pool(struct page_pool *pool); void xdp_unreg_page_pool(const struct page_pool *pool); void xdp_rxq_info_attach_page_pool(struct xdp_rxq_info *xdp_rxq, const struct page_pool *pool); /** * xdp_rxq_info_attach_mem_model - attach registered mem info to RxQ info * @xdp_rxq: XDP RxQ info to attach the memory info to * @mem: already registered memory info * * If the driver registers its memory providers manually, it must use this * function instead of xdp_rxq_info_reg_mem_model(). 
*/ static inline void xdp_rxq_info_attach_mem_model(struct xdp_rxq_info *xdp_rxq, const struct xdp_mem_info *mem) { xdp_rxq->mem = *mem; } /** * xdp_rxq_info_detach_mem_model - detach registered mem info from RxQ info * @xdp_rxq: XDP RxQ info to detach the memory info from * * If the driver registers its memory providers manually and then attaches it * via xdp_rxq_info_attach_mem_model(), it must call this function before * xdp_rxq_info_unreg(). */ static inline void xdp_rxq_info_detach_mem_model(struct xdp_rxq_info *xdp_rxq) { xdp_rxq->mem = (struct xdp_mem_info){ }; } /* Drivers not supporting XDP metadata can use this helper, which * rejects any room expansion for metadata as a result. */ static __always_inline void xdp_set_data_meta_invalid(struct xdp_buff *xdp) { xdp->data_meta = xdp->data + 1; } static __always_inline bool xdp_data_meta_unsupported(const struct xdp_buff *xdp) { return unlikely(xdp->data_meta > xdp->data); } static inline bool xdp_metalen_invalid(unsigned long metalen) { unsigned long meta_max; meta_max = type_max(typeof_member(struct skb_shared_info, meta_len)); BUILD_BUG_ON(!__builtin_constant_p(meta_max)); return !IS_ALIGNED(metalen, sizeof(u32)) || metalen > meta_max; } struct xdp_attachment_info { struct bpf_prog *prog; u32 flags; }; struct netdev_bpf; void xdp_attachment_setup(struct xdp_attachment_info *info, struct netdev_bpf *bpf); #define DEV_MAP_BULK_SIZE XDP_BULK_QUEUE_SIZE /* Define the relationship between xdp-rx-metadata kfunc and * various other entities: * - xdp_rx_metadata enum * - netdev netlink enum (Documentation/netlink/specs/netdev.yaml) * - kfunc name * - xdp_metadata_ops field */ #define XDP_METADATA_KFUNC_xxx \ XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_RX_TIMESTAMP, \ NETDEV_XDP_RX_METADATA_TIMESTAMP, \ bpf_xdp_metadata_rx_timestamp, \ xmo_rx_timestamp) \ XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_RX_HASH, \ NETDEV_XDP_RX_METADATA_HASH, \ bpf_xdp_metadata_rx_hash, \ xmo_rx_hash) \ XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_RX_VLAN_TAG, \ NETDEV_XDP_RX_METADATA_VLAN_TAG, \ bpf_xdp_metadata_rx_vlan_tag, \ xmo_rx_vlan_tag) \ enum xdp_rx_metadata { #define XDP_METADATA_KFUNC(name, _, __, ___) name, XDP_METADATA_KFUNC_xxx #undef XDP_METADATA_KFUNC MAX_XDP_METADATA_KFUNC, }; enum xdp_rss_hash_type { /* First part: Individual bits for L3/L4 types */ XDP_RSS_L3_IPV4 = BIT(0), XDP_RSS_L3_IPV6 = BIT(1), /* The fixed (L3) IPv4 and IPv6 headers can both be followed by * variable/dynamic headers, IPv4 called Options and IPv6 called * Extension Headers. HW RSS type can contain this info. */ XDP_RSS_L3_DYNHDR = BIT(2), /* When RSS hash covers L4 then drivers MUST set XDP_RSS_L4 bit in * addition to the protocol specific bit. This ease interaction with * SKBs and avoids reserving a fixed mask for future L4 protocol bits. 
*/ XDP_RSS_L4 = BIT(3), /* L4 based hash, proto can be unknown */ XDP_RSS_L4_TCP = BIT(4), XDP_RSS_L4_UDP = BIT(5), XDP_RSS_L4_SCTP = BIT(6), XDP_RSS_L4_IPSEC = BIT(7), /* L4 based hash include IPSEC SPI */ XDP_RSS_L4_ICMP = BIT(8), /* Second part: RSS hash type combinations used for driver HW mapping */ XDP_RSS_TYPE_NONE = 0, XDP_RSS_TYPE_L2 = XDP_RSS_TYPE_NONE, XDP_RSS_TYPE_L3_IPV4 = XDP_RSS_L3_IPV4, XDP_RSS_TYPE_L3_IPV6 = XDP_RSS_L3_IPV6, XDP_RSS_TYPE_L3_IPV4_OPT = XDP_RSS_L3_IPV4 | XDP_RSS_L3_DYNHDR, XDP_RSS_TYPE_L3_IPV6_EX = XDP_RSS_L3_IPV6 | XDP_RSS_L3_DYNHDR, XDP_RSS_TYPE_L4_ANY = XDP_RSS_L4, XDP_RSS_TYPE_L4_IPV4_TCP = XDP_RSS_L3_IPV4 | XDP_RSS_L4 | XDP_RSS_L4_TCP, XDP_RSS_TYPE_L4_IPV4_UDP = XDP_RSS_L3_IPV4 | XDP_RSS_L4 | XDP_RSS_L4_UDP, XDP_RSS_TYPE_L4_IPV4_SCTP = XDP_RSS_L3_IPV4 | XDP_RSS_L4 | XDP_RSS_L4_SCTP, XDP_RSS_TYPE_L4_IPV4_IPSEC = XDP_RSS_L3_IPV4 | XDP_RSS_L4 | XDP_RSS_L4_IPSEC, XDP_RSS_TYPE_L4_IPV4_ICMP = XDP_RSS_L3_IPV4 | XDP_RSS_L4 | XDP_RSS_L4_ICMP, XDP_RSS_TYPE_L4_IPV6_TCP = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_TCP, XDP_RSS_TYPE_L4_IPV6_UDP = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_UDP, XDP_RSS_TYPE_L4_IPV6_SCTP = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_SCTP, XDP_RSS_TYPE_L4_IPV6_IPSEC = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_IPSEC, XDP_RSS_TYPE_L4_IPV6_ICMP = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_ICMP, XDP_RSS_TYPE_L4_IPV6_TCP_EX = XDP_RSS_TYPE_L4_IPV6_TCP | XDP_RSS_L3_DYNHDR, XDP_RSS_TYPE_L4_IPV6_UDP_EX = XDP_RSS_TYPE_L4_IPV6_UDP | XDP_RSS_L3_DYNHDR, XDP_RSS_TYPE_L4_IPV6_SCTP_EX = XDP_RSS_TYPE_L4_IPV6_SCTP | XDP_RSS_L3_DYNHDR, }; struct xdp_metadata_ops { int (*xmo_rx_timestamp)(const struct xdp_md *ctx, u64 *timestamp); int (*xmo_rx_hash)(const struct xdp_md *ctx, u32 *hash, enum xdp_rss_hash_type *rss_type); int (*xmo_rx_vlan_tag)(const struct xdp_md *ctx, __be16 *vlan_proto, u16 *vlan_tci); }; #ifdef CONFIG_NET u32 bpf_xdp_metadata_kfunc_id(int id); bool bpf_dev_bound_kfunc_id(u32 btf_id); void xdp_set_features_flag(struct net_device *dev, xdp_features_t val); void xdp_set_features_flag_locked(struct net_device *dev, xdp_features_t val); void xdp_features_set_redirect_target(struct net_device *dev, bool support_sg); void xdp_features_set_redirect_target_locked(struct net_device *dev, bool support_sg); void xdp_features_clear_redirect_target(struct net_device *dev); void xdp_features_clear_redirect_target_locked(struct net_device *dev); #else static inline u32 bpf_xdp_metadata_kfunc_id(int id) { return 0; } static inline bool bpf_dev_bound_kfunc_id(u32 btf_id) { return false; } static inline void xdp_set_features_flag(struct net_device *dev, xdp_features_t val) { } static inline void xdp_features_set_redirect_target(struct net_device *dev, bool support_sg) { } static inline void xdp_features_clear_redirect_target(struct net_device *dev) { } #endif static inline void xdp_clear_features_flag(struct net_device *dev) { xdp_set_features_flag(dev, 0); } static __always_inline u32 bpf_prog_run_xdp(const struct bpf_prog *prog, struct xdp_buff *xdp) { /* Driver XDP hooks are invoked within a single NAPI poll cycle and thus * under local_bh_disable(), which provides the needed RCU protection * for accessing map entries. */ u32 act = __bpf_prog_run(prog, xdp, BPF_DISPATCHER_FUNC(xdp)); if (static_branch_unlikely(&bpf_master_redirect_enabled_key)) { if (act == XDP_TX && netif_is_bond_slave(xdp->rxq->dev)) act = xdp_master_redirect(xdp); } return act; } #endif /* __LINUX_NET_XDP_H__ */
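A hypothetical driver-style sketch of the intended call sequence for the helpers declared above: initialize an xdp_buff around a received buffer, describe the packet area, then run the attached program. The buffer geometry parameters (hard_start, headroom, len, frame_sz) are assumed to come from the driver's own RX descriptor handling.

/* Hypothetical RX-path snippet, not from any driver: build an xdp_buff and
 * run the XDP program. Passing meta_valid == false marks metadata as
 * unsupported for this buffer.
 */
static u32 example_run_xdp(struct bpf_prog *prog, struct xdp_rxq_info *rxq,
			   unsigned char *hard_start, int headroom, int len,
			   u32 frame_sz)
{
	struct xdp_buff xdp;

	xdp_init_buff(&xdp, frame_sz, rxq);
	xdp_prepare_buff(&xdp, hard_start, headroom, len, false);

	return bpf_prog_run_xdp(prog, &xdp);
}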
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * ALSA sequencer FIFO
 * Copyright (c) 1998 by Frank van de Pol <fvdpol@coil.demon.nl>
 */
#ifndef __SND_SEQ_FIFO_H
#define __SND_SEQ_FIFO_H

#include "seq_memory.h"
#include "seq_lock.h"

/* === FIFO === */

struct snd_seq_fifo {
	struct snd_seq_pool *pool;		/* FIFO pool */
	struct snd_seq_event_cell *head;	/* pointer to head of fifo */
	struct snd_seq_event_cell *tail;	/* pointer to tail of fifo */
	int cells;
	spinlock_t lock;
	snd_use_lock_t use_lock;
	wait_queue_head_t input_sleep;
	atomic_t overflow;
};

/* create new fifo (constructor) */
struct snd_seq_fifo *snd_seq_fifo_new(int poolsize);

/* delete fifo (destructor) */
void snd_seq_fifo_delete(struct snd_seq_fifo **f);

/* enqueue event to fifo */
int snd_seq_fifo_event_in(struct snd_seq_fifo *f, struct snd_seq_event *event);

/* lock fifo from release */
#define snd_seq_fifo_lock(fifo)		snd_use_lock_use(&(fifo)->use_lock)
#define snd_seq_fifo_unlock(fifo)	snd_use_lock_free(&(fifo)->use_lock)

DEFINE_GUARD(snd_seq_fifo, struct snd_seq_fifo *,
	     snd_seq_fifo_lock(_T), snd_seq_fifo_unlock(_T))

/* get a cell from fifo - fifo should be locked */
int snd_seq_fifo_cell_out(struct snd_seq_fifo *f,
			  struct snd_seq_event_cell **cellp, int nonblock);

/* free dequeued cell - fifo should be locked */
void snd_seq_fifo_cell_putback(struct snd_seq_fifo *f,
			       struct snd_seq_event_cell *cell);

/* clean up queue */
void snd_seq_fifo_clear(struct snd_seq_fifo *f);

/* polling */
int snd_seq_fifo_poll_wait(struct snd_seq_fifo *f, struct file *file,
			   poll_table *wait);

/* resize pool in fifo */
int snd_seq_fifo_resize(struct snd_seq_fifo *f, int poolsize);

/* get the number of unused cells safely */
int snd_seq_fifo_unused_cells(struct snd_seq_fifo *f);

#endif
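A brief, hypothetical consumer sketch for the API above: take the use-lock so the FIFO cannot be released underneath us, pop one event cell without blocking, then drop the lock again.

/* Hypothetical consumer, not part of the ALSA sequencer core: fetch one
 * event cell from the FIFO non-blockingly. Returns 0 or a negative errno
 * from snd_seq_fifo_cell_out().
 */
static int example_pop_cell(struct snd_seq_fifo *f,
			    struct snd_seq_event_cell **cellp)
{
	int err;

	snd_seq_fifo_lock(f);
	err = snd_seq_fifo_cell_out(f, cellp, 1);
	snd_seq_fifo_unlock(f);

	return err;
}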
// SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved * Copyright 2001-2006 Ian Kent <raven@themaw.net> */ #include <linux/sched/signal.h> #include "autofs_i.h" /* We make this a static variable rather than a part of the superblock; it * is better if we don't reassign numbers easily even across filesystems */ static autofs_wqt_t autofs_next_wait_queue = 1; void autofs_catatonic_mode(struct autofs_sb_info *sbi) { struct autofs_wait_queue *wq, *nwq; mutex_lock(&sbi->wq_mutex); if (sbi->flags & AUTOFS_SBI_CATATONIC) { mutex_unlock(&sbi->wq_mutex); return; } pr_debug("entering catatonic mode\n"); sbi->flags |= AUTOFS_SBI_CATATONIC; wq = sbi->queues; sbi->queues = NULL; /* Erase all wait queues */ while (wq) { nwq = wq->next; wq->status = -ENOENT; /* Magic is gone - report failure */ kfree(wq->name.name - wq->offset); wq->name.name = NULL; wake_up(&wq->queue); if (!--wq->wait_ctr) kfree(wq); wq = nwq; } fput(sbi->pipe); /* Close the pipe */ sbi->pipe = NULL; sbi->pipefd = -1; mutex_unlock(&sbi->wq_mutex); } static int autofs_write(struct autofs_sb_info *sbi, struct file *file, const void *addr, int bytes) { unsigned long sigpipe, flags; const char *data = (const char *)addr; ssize_t wr = 0; sigpipe = sigismember(&current->pending.signal, SIGPIPE); mutex_lock(&sbi->pipe_mutex); while (bytes) { wr = __kernel_write(file, data, bytes, NULL); if (wr <= 0) break; data += wr; bytes -= wr; } mutex_unlock(&sbi->pipe_mutex); /* Keep the currently executing process from receiving a * SIGPIPE unless it was already supposed to get one */
if (wr == -EPIPE && !sigpipe) { spin_lock_irqsave(&current->sighand->siglock, flags); sigdelset(&current->pending.signal, SIGPIPE); recalc_sigpending(); spin_unlock_irqrestore(&current->sighand->siglock, flags); } /* if 'wr' returned 0 (impossible) we assume -EIO (safe) */ return bytes == 0 ? 0 : wr < 0 ? wr : -EIO; } static void autofs_notify_daemon(struct autofs_sb_info *sbi, struct autofs_wait_queue *wq, int type) { union { struct autofs_packet_hdr hdr; union autofs_packet_union v4_pkt; union autofs_v5_packet_union v5_pkt; } pkt; struct file *pipe = NULL; size_t pktsz; int ret; pr_debug("wait id = 0x%08lx, name = %.*s, type=%d\n", (unsigned long) wq->wait_queue_token, wq->name.len, wq->name.name, type); memset(&pkt, 0, sizeof(pkt)); /* For security reasons */ pkt.hdr.proto_version = sbi->version; pkt.hdr.type = type; switch (type) { /* Kernel protocol v4 missing and expire packets */ case autofs_ptype_missing: { struct autofs_packet_missing *mp = &pkt.v4_pkt.missing; pktsz = sizeof(*mp); mp->wait_queue_token = wq->wait_queue_token; mp->len = wq->name.len; memcpy(mp->name, wq->name.name, wq->name.len); mp->name[wq->name.len] = '\0'; break; } case autofs_ptype_expire_multi: { struct autofs_packet_expire_multi *ep = &pkt.v4_pkt.expire_multi; pktsz = sizeof(*ep); ep->wait_queue_token = wq->wait_queue_token; ep->len = wq->name.len; memcpy(ep->name, wq->name.name, wq->name.len); ep->name[wq->name.len] = '\0'; break; } /* * Kernel protocol v5 packet for handling indirect and direct * mount missing and expire requests */ case autofs_ptype_missing_indirect: case autofs_ptype_expire_indirect: case autofs_ptype_missing_direct: case autofs_ptype_expire_direct: { struct autofs_v5_packet *packet = &pkt.v5_pkt.v5_packet; struct user_namespace *user_ns = sbi->pipe->f_cred->user_ns; pktsz = sizeof(*packet); packet->wait_queue_token = wq->wait_queue_token; packet->len = wq->name.len; memcpy(packet->name, wq->name.name, wq->name.len); packet->name[wq->name.len] = '\0'; packet->dev = wq->dev; packet->ino = wq->ino; packet->uid = from_kuid_munged(user_ns, wq->uid); packet->gid = from_kgid_munged(user_ns, wq->gid); packet->pid = wq->pid; packet->tgid = wq->tgid; break; } default: pr_warn("bad type %d!\n", type); mutex_unlock(&sbi->wq_mutex); return; } pipe = get_file(sbi->pipe); mutex_unlock(&sbi->wq_mutex); switch (ret = autofs_write(sbi, pipe, &pkt, pktsz)) { case 0: break; case -ENOMEM: case -ERESTARTSYS: /* Just fail this one */ autofs_wait_release(sbi, wq->wait_queue_token, ret); break; default: autofs_catatonic_mode(sbi); break; } fput(pipe); } static struct autofs_wait_queue * autofs_find_wait(struct autofs_sb_info *sbi, const struct qstr *qstr) { struct autofs_wait_queue *wq; for (wq = sbi->queues; wq; wq = wq->next) { if (wq->name.hash == qstr->hash && wq->name.len == qstr->len && wq->name.name && !memcmp(wq->name.name, qstr->name, qstr->len)) break; } return wq; } /* * Check if we have a valid request. * Returns * 1 if the request should continue. * In this case we can return an autofs_wait_queue entry if one is * found or NULL to indicate a new wait needs to be created. * 0 or a negative errno if the request shouldn't continue.
*/ static int validate_request(struct autofs_wait_queue **wait, struct autofs_sb_info *sbi, const struct qstr *qstr, const struct path *path, enum autofs_notify notify) { struct dentry *dentry = path->dentry; struct autofs_wait_queue *wq; struct autofs_info *ino; if (sbi->flags & AUTOFS_SBI_CATATONIC) return -ENOENT; /* Wait in progress, continue; */ wq = autofs_find_wait(sbi, qstr); if (wq) { *wait = wq; return 1; } *wait = NULL; /* If we don't yet have any info this is a new request */ ino = autofs_dentry_ino(dentry); if (!ino) return 1; /* * If we've been asked to wait on an existing expire (NFY_NONE) * but there is no wait in the queue ... */ if (notify == NFY_NONE) { /* * Either we've beaten the pending expire to post its * wait or it finished while we waited on the mutex. * So we need to wait till either the wait appears * or the expire finishes. */ while (ino->flags & AUTOFS_INF_EXPIRING) { mutex_unlock(&sbi->wq_mutex); schedule_timeout_interruptible(HZ/10); if (mutex_lock_interruptible(&sbi->wq_mutex)) return -EINTR; if (sbi->flags & AUTOFS_SBI_CATATONIC) return -ENOENT; wq = autofs_find_wait(sbi, qstr); if (wq) { *wait = wq; return 1; } } /* * Not ideal but the status has already gone. Of the two * cases where we wait on NFY_NONE neither depends on the * return status of the wait. */ return 0; } /* * If we've been asked to trigger a mount and the request * completed while we waited on the mutex ... */ if (notify == NFY_MOUNT) { struct dentry *new = NULL; struct path this; int valid = 1; /* * If the dentry was successfully mounted while we slept * on the wait queue mutex we can return success. If it * isn't mounted (doesn't have submounts for the case of * a multi-mount with no mount at its base) we can * continue on and create a new request. */ if (!IS_ROOT(dentry)) { if (d_unhashed(dentry) && d_really_is_positive(dentry)) { struct dentry *parent = dentry->d_parent; new = d_lookup(parent, &dentry->d_name); if (new) dentry = new; } } this.mnt = path->mnt; this.dentry = dentry; if (path_has_submounts(&this)) valid = 0; if (new) dput(new); return valid; } return 1; } int autofs_wait(struct autofs_sb_info *sbi, const struct path *path, enum autofs_notify notify) { struct dentry *dentry = path->dentry; struct autofs_wait_queue *wq; struct qstr qstr; char *name; int status, ret, type; unsigned int offset = 0; pid_t pid; pid_t tgid; /* In catatonic mode, we don't wait for anybody */ if (sbi->flags & AUTOFS_SBI_CATATONIC) return -ENOENT; /* * Try translating pids to the namespace of the daemon. * * Zero means failure: we are in an unrelated pid namespace. */ pid = task_pid_nr_ns(current, ns_of_pid(sbi->oz_pgrp)); tgid = task_tgid_nr_ns(current, ns_of_pid(sbi->oz_pgrp)); if (pid == 0 || tgid == 0) return -ENOENT; if (d_really_is_negative(dentry)) { /* * A wait for a negative dentry is invalid for certain * cases. A direct or offset mount "always" has its mount * point directory created and so the request dentry must * be positive or the map key doesn't exist. The situation * is very similar for indirect mounts except only dentries * in the root of the autofs file system may be negative.
*/ if (autofs_type_trigger(sbi->type)) return -ENOENT; else if (!IS_ROOT(dentry->d_parent)) return -ENOENT; } name = kmalloc(NAME_MAX + 1, GFP_KERNEL); if (!name) return -ENOMEM; /* If this is a direct mount request create a dummy name */ if (IS_ROOT(dentry) && autofs_type_trigger(sbi->type)) { qstr.name = name; qstr.len = sprintf(name, "%p", dentry); } else { char *p = dentry_path_raw(dentry, name, NAME_MAX); if (IS_ERR(p)) { kfree(name); return -ENOENT; } qstr.name = ++p; // skip the leading slash qstr.len = strlen(p); offset = p - name; } qstr.hash = full_name_hash(dentry, qstr.name, qstr.len); if (mutex_lock_interruptible(&sbi->wq_mutex)) { kfree(name); return -EINTR; } ret = validate_request(&wq, sbi, &qstr, path, notify); if (ret <= 0) { if (ret != -EINTR) mutex_unlock(&sbi->wq_mutex); kfree(name); return ret; } if (!wq) { /* Create a new wait queue */ wq = kmalloc(sizeof(struct autofs_wait_queue), GFP_KERNEL); if (!wq) { kfree(name); mutex_unlock(&sbi->wq_mutex); return -ENOMEM; } wq->wait_queue_token = autofs_next_wait_queue; if (++autofs_next_wait_queue == 0) autofs_next_wait_queue = 1; wq->next = sbi->queues; sbi->queues = wq; init_waitqueue_head(&wq->queue); memcpy(&wq->name, &qstr, sizeof(struct qstr)); wq->offset = offset; wq->dev = autofs_get_dev(sbi); wq->ino = autofs_get_ino(sbi); wq->uid = current_uid(); wq->gid = current_gid(); wq->pid = pid; wq->tgid = tgid; wq->status = -EINTR; /* Status return if interrupted */ wq->wait_ctr = 2; if (sbi->version < 5) { if (notify == NFY_MOUNT) type = autofs_ptype_missing; else type = autofs_ptype_expire_multi; } else { if (notify == NFY_MOUNT) type = autofs_type_trigger(sbi->type) ? autofs_ptype_missing_direct : autofs_ptype_missing_indirect; else type = autofs_type_trigger(sbi->type) ? autofs_ptype_expire_direct : autofs_ptype_expire_indirect; } pr_debug("new wait id = 0x%08lx, name = %.*s, nfy=%d\n", (unsigned long) wq->wait_queue_token, wq->name.len, wq->name.name, notify); /* * autofs_notify_daemon() may block; it will unlock ->wq_mutex */ autofs_notify_daemon(sbi, wq, type); } else { wq->wait_ctr++; pr_debug("existing wait id = 0x%08lx, name = %.*s, nfy=%d\n", (unsigned long) wq->wait_queue_token, wq->name.len, wq->name.name, notify); mutex_unlock(&sbi->wq_mutex); kfree(name); } /* * wq->name.name is NULL iff the lock is already released * or the mount has been made catatonic. */ wait_event_killable(wq->queue, wq->name.name == NULL); status = wq->status; /* * For direct and offset mounts we need to track the requester's * uid and gid in the dentry info struct. This is so it can be * supplied, on request, by the misc device ioctl interface. * This is needed during daemon restart when reconnecting * to existing, active, autofs mounts. The uid and gid (and * related string values) may be used for macro substitution * in autofs mount maps. */ if (!status) { struct autofs_info *ino; struct dentry *de = NULL; /* direct mount or browsable map */ ino = autofs_dentry_ino(dentry); if (!ino) { /* If not, look up the actual dentry used */ de = d_lookup(dentry->d_parent, &dentry->d_name); if (de) ino = autofs_dentry_ino(de); } /* Set mount requester */ if (ino) { spin_lock(&sbi->fs_lock); ino->uid = wq->uid; ino->gid = wq->gid; spin_unlock(&sbi->fs_lock); } if (de) dput(de); } /* Are we the last process to need status?
*/ mutex_lock(&sbi->wq_mutex); if (!--wq->wait_ctr) kfree(wq); mutex_unlock(&sbi->wq_mutex); return status; } int autofs_wait_release(struct autofs_sb_info *sbi, autofs_wqt_t wait_queue_token, int status) { struct autofs_wait_queue *wq, **wql; mutex_lock(&sbi->wq_mutex); for (wql = &sbi->queues; (wq = *wql) != NULL; wql = &wq->next) { if (wq->wait_queue_token == wait_queue_token) break; } if (!wq) { mutex_unlock(&sbi->wq_mutex); return -EINVAL; } *wql = wq->next; /* Unlink from chain */ kfree(wq->name.name - wq->offset); wq->name.name = NULL; /* Do not wait on this queue */ wq->status = status; wake_up(&wq->queue); if (!--wq->wait_ctr) kfree(wq); mutex_unlock(&sbi->wq_mutex); return 0; }
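For context, a hypothetical sketch of the completion side of the protocol implemented above: once the daemon has handled a request it acknowledges it by token, which wakes the process sleeping in autofs_wait(). In the real code this is driven by the autofs ioctl paths; the helper below is illustrative only.

/* Hypothetical acknowledgement helper, not part of fs/autofs: report the
 * outcome of a request back to the waiter identified by @token. A zero
 * status means success, a negative errno propagates a failure.
 */
static int example_ack_request(struct autofs_sb_info *sbi,
			       autofs_wqt_t token, bool succeeded)
{
	return autofs_wait_release(sbi, token, succeeded ? 0 : -ENOENT);
}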
// SPDX-License-Identifier: GPL-2.0-or-later /* * net-sysfs.c - network device class and attributes * * Copyright (c) 2003 Stephen Hemminger <shemminger@osdl.org> */ #include <linux/capability.h> #include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/slab.h> #include <linux/sched/signal.h> #include <linux/sched/isolation.h> #include <linux/nsproxy.h> #include <net/sock.h> #include <net/net_namespace.h> #include <linux/rtnetlink.h> #include <linux/vmalloc.h> #include <linux/export.h> #include <linux/jiffies.h> #include <linux/pm_runtime.h> #include <linux/of.h> #include <linux/of_net.h> #include <linux/cpu.h> #include <net/netdev_lock.h> #include <net/netdev_rx_queue.h> #include <net/rps.h> #include "dev.h" #include "net-sysfs.h" #ifdef CONFIG_SYSFS static const char fmt_hex[] = "%#x\n"; static const char fmt_dec[] = "%d\n"; static const char fmt_uint[] = "%u\n"; static const char fmt_ulong[] = "%lu\n"; static const char fmt_u64[] = "%llu\n"; /* Caller holds RTNL, netdev->lock or RCU */ static inline int dev_isalive(const struct net_device *dev) { return READ_ONCE(dev->reg_state) <= NETREG_REGISTERED; } /* There is a possible ABBA deadlock between rtnl_lock and kernfs_node->active, * when unregistering a net device and accessing associated sysfs files. The * potential deadlock is as follows: * * CPU 0 CPU 1 * * rtnl_lock vfs_read * unregister_netdevice_many kernfs_seq_start * device_del / kobject_put kernfs_get_active (kn->active++) * kernfs_drain sysfs_kf_seq_show * wait_event( rtnl_lock * kn->active == KN_DEACTIVATED_BIAS) -> waits on CPU 0 to release * -> waits on CPU 1 to decrease kn->active the rtnl lock. * * The historical fix was to use rtnl_trylock with restart_syscall to bail out * of sysfs operations when the lock couldn't be taken. This fixed the above * issue as it allowed CPU 1 to bail out of the ABBA situation. * * But it came with performance issues, as syscalls are being restarted in * loops when there was contention on the rtnl lock, with huge slowdowns in * specific scenarios (e.g. lots of virtual interfaces created and userspace * daemons querying their attributes). * * The idea below is to bail out of the active kernfs_node protection * (kn->active) while trying to take the rtnl lock. * * This replaces rtnl_lock() and still has to be used with rtnl_unlock(). The * net device is guaranteed to be alive if this returns successfully. */ static int sysfs_rtnl_lock(struct kobject *kobj, struct attribute *attr, struct net_device *ndev) { struct kernfs_node *kn; int ret = 0; /* First, we hold a reference to the net device as the unregistration * path might run in parallel. This will ensure the net device and the * associated sysfs objects won't be freed while we try to take the rtnl * lock.
*/ dev_hold(ndev); /* sysfs_break_active_protection was introduced to allow self-removal of * devices and their associated sysfs files by bailing out of the * sysfs/kernfs protection. We do this here to allow the unregistration * path to complete in parallel. The following takes a reference on the * kobject and the kernfs_node being accessed. * * This works because we hold a reference onto the net device and the * unregistration path will wait for us eventually in netdev_run_todo * (outside an rtnl lock section). */ kn = sysfs_break_active_protection(kobj, attr); /* We can now try to take the rtnl lock. This can't deadlock us as the * unregistration path is able to drain sysfs files (kernfs_node) thanks * to the above dance. */ if (rtnl_lock_interruptible()) { ret = -ERESTARTSYS; goto unbreak; } /* Check dismantle on the device hasn't started, otherwise deny the * operation. */ if (!dev_isalive(ndev)) { rtnl_unlock(); ret = -ENODEV; goto unbreak; } /* We are now sure the device dismantle hasn't started nor that it can * start before we exit the locking section as we hold the rtnl lock. * There's no need to keep unbreaking the sysfs protection nor to hold * a net device reference from that point; that was only needed to take * the rtnl lock. */ unbreak: sysfs_unbreak_active_protection(kn); dev_put(ndev); return ret; } /* use same locking rules as GIF* ioctl's */ static ssize_t netdev_show(const struct device *dev, struct device_attribute *attr, char *buf, ssize_t (*format)(const struct net_device *, char *)) { struct net_device *ndev = to_net_dev(dev); ssize_t ret = -EINVAL; rcu_read_lock(); if (dev_isalive(ndev)) ret = (*format)(ndev, buf); rcu_read_unlock(); return ret; } /* generate a show function for simple field */ #define NETDEVICE_SHOW(field, format_string) \ static ssize_t format_##field(const struct net_device *dev, char *buf) \ { \ return sysfs_emit(buf, format_string, READ_ONCE(dev->field)); \ } \ static ssize_t field##_show(struct device *dev, \ struct device_attribute *attr, char *buf) \ { \ return netdev_show(dev, attr, buf, format_##field); \ } \ #define NETDEVICE_SHOW_RO(field, format_string) \ NETDEVICE_SHOW(field, format_string); \ static DEVICE_ATTR_RO(field) #define NETDEVICE_SHOW_RW(field, format_string) \ NETDEVICE_SHOW(field, format_string); \ static DEVICE_ATTR_RW(field) /* use same locking and permission rules as SIF* ioctl's */ static ssize_t netdev_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len, int (*set)(struct net_device *, unsigned long)) { struct net_device *netdev = to_net_dev(dev); struct net *net = dev_net(netdev); unsigned long new; int ret; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; ret = kstrtoul(buf, 0, &new); if (ret) goto err; ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev); if (ret) goto err; ret = (*set)(netdev, new); if (ret == 0) ret = len; rtnl_unlock(); err: return ret; } /* Same as netdev_store() but takes netdev_lock() instead of rtnl_lock() */ static ssize_t netdev_lock_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len, int (*set)(struct net_device *, unsigned long)) { struct net_device *netdev = to_net_dev(dev); struct net *net = dev_net(netdev); unsigned long new; int ret; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; ret = kstrtoul(buf, 0, &new); if (ret) return ret; netdev_lock(netdev); if (dev_isalive(netdev)) { ret = (*set)(netdev, new); if (ret == 0) ret = len; } netdev_unlock(netdev); return ret; } 
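/* A minimal userspace sketch of how the store path above is exercised: plain
 * writes to per-device attributes such as /sys/class/net/<iface>/mtu end up
 * in netdev_store(), which parses the buffer with kstrtoul() and calls the
 * setter under the rtnl lock. The interface name "eth0" and the MTU value
 * below are assumptions chosen only for illustration; CAP_NET_ADMIN in the
 * owning user namespace is required, matching the ns_capable() check above.
 * This is ordinary userspace code, not part of this kernel file.
 */
#include <stdio.h>

int main(void)
{
	const char *path = "/sys/class/net/eth0/mtu";	/* assumed interface */
	char buf[32];
	FILE *f;

	f = fopen(path, "w");
	if (!f) {
		perror("fopen for write");
		return 1;
	}
	/* this string is parsed by kstrtoul() in netdev_store() */
	fputs("1400\n", f);
	fclose(f);

	f = fopen(path, "r");
	if (!f) {
		perror("fopen for read");
		return 1;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("mtu is now %s", buf);
	fclose(f);
	return 0;
}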
NETDEVICE_SHOW_RO(dev_id, fmt_hex); NETDEVICE_SHOW_RO(dev_port, fmt_dec); NETDEVICE_SHOW_RO(addr_assign_type, fmt_dec); NETDEVICE_SHOW_RO(addr_len, fmt_dec); NETDEVICE_SHOW_RO(ifindex, fmt_dec); NETDEVICE_SHOW_RO(type, fmt_dec); NETDEVICE_SHOW_RO(link_mode, fmt_dec); static ssize_t iflink_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *ndev = to_net_dev(dev); return sysfs_emit(buf, fmt_dec, dev_get_iflink(ndev)); } static DEVICE_ATTR_RO(iflink); static ssize_t format_name_assign_type(const struct net_device *dev, char *buf) { return sysfs_emit(buf, fmt_dec, READ_ONCE(dev->name_assign_type)); } static ssize_t name_assign_type_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *ndev = to_net_dev(dev); ssize_t ret = -EINVAL; if (READ_ONCE(ndev->name_assign_type) != NET_NAME_UNKNOWN) ret = netdev_show(dev, attr, buf, format_name_assign_type); return ret; } static DEVICE_ATTR_RO(name_assign_type); /* use same locking rules as GIFHWADDR ioctl's (netif_get_mac_address()) */ static ssize_t address_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *ndev = to_net_dev(dev); ssize_t ret = -EINVAL; down_read(&dev_addr_sem); rcu_read_lock(); if (dev_isalive(ndev)) ret = sysfs_format_mac(buf, ndev->dev_addr, ndev->addr_len); rcu_read_unlock(); up_read(&dev_addr_sem); return ret; } static DEVICE_ATTR_RO(address); static ssize_t broadcast_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *ndev = to_net_dev(dev); int ret = -EINVAL; rcu_read_lock(); if (dev_isalive(ndev)) ret = sysfs_format_mac(buf, ndev->broadcast, ndev->addr_len); rcu_read_unlock(); return ret; } static DEVICE_ATTR_RO(broadcast); static int change_carrier(struct net_device *dev, unsigned long new_carrier) { if (!netif_running(dev)) return -EINVAL; return dev_change_carrier(dev, (bool)new_carrier); } static ssize_t carrier_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { struct net_device *netdev = to_net_dev(dev); /* The check is also done in change_carrier; this helps returning early * without hitting the locking section in netdev_store. */ if (!netdev->netdev_ops->ndo_change_carrier) return -EOPNOTSUPP; return netdev_store(dev, attr, buf, len, change_carrier); } static ssize_t carrier_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *netdev = to_net_dev(dev); int ret; ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev); if (ret) return ret; ret = -EINVAL; if (netif_running(netdev)) { /* Synchronize carrier state with link watch, * see also rtnl_getlink(). */ linkwatch_sync_dev(netdev); ret = sysfs_emit(buf, fmt_dec, !!netif_carrier_ok(netdev)); } rtnl_unlock(); return ret; } static DEVICE_ATTR_RW(carrier); static ssize_t speed_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *netdev = to_net_dev(dev); int ret = -EINVAL; /* The check is also done in __ethtool_get_link_ksettings; this helps * returning early without hitting the locking section below. 
*/ if (!netdev->ethtool_ops->get_link_ksettings) return ret; ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev); if (ret) return ret; ret = -EINVAL; if (netif_running(netdev)) { struct ethtool_link_ksettings cmd; if (!__ethtool_get_link_ksettings(netdev, &cmd)) ret = sysfs_emit(buf, fmt_dec, cmd.base.speed); } rtnl_unlock(); return ret; } static DEVICE_ATTR_RO(speed); static ssize_t duplex_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *netdev = to_net_dev(dev); int ret = -EINVAL; /* The check is also done in __ethtool_get_link_ksettings; this helps * returning early without hitting the locking section below. */ if (!netdev->ethtool_ops->get_link_ksettings) return ret; ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev); if (ret) return ret; ret = -EINVAL; if (netif_running(netdev)) { struct ethtool_link_ksettings cmd; if (!__ethtool_get_link_ksettings(netdev, &cmd)) { const char *duplex; switch (cmd.base.duplex) { case DUPLEX_HALF: duplex = "half"; break; case DUPLEX_FULL: duplex = "full"; break; default: duplex = "unknown"; break; } ret = sysfs_emit(buf, "%s\n", duplex); } } rtnl_unlock(); return ret; } static DEVICE_ATTR_RO(duplex); static ssize_t testing_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *netdev = to_net_dev(dev); if (netif_running(netdev)) return sysfs_emit(buf, fmt_dec, !!netif_testing(netdev)); return -EINVAL; } static DEVICE_ATTR_RO(testing); static ssize_t dormant_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *netdev = to_net_dev(dev); if (netif_running(netdev)) return sysfs_emit(buf, fmt_dec, !!netif_dormant(netdev)); return -EINVAL; } static DEVICE_ATTR_RO(dormant); static const char *const operstates[] = { "unknown", "notpresent", /* currently unused */ "down", "lowerlayerdown", "testing", "dormant", "up" }; static ssize_t operstate_show(struct device *dev, struct device_attribute *attr, char *buf) { const struct net_device *netdev = to_net_dev(dev); unsigned char operstate; operstate = READ_ONCE(netdev->operstate); if (!netif_running(netdev)) operstate = IF_OPER_DOWN; if (operstate >= ARRAY_SIZE(operstates)) return -EINVAL; /* should not happen */ return sysfs_emit(buf, "%s\n", operstates[operstate]); } static DEVICE_ATTR_RO(operstate); static ssize_t carrier_changes_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *netdev = to_net_dev(dev); return sysfs_emit(buf, fmt_dec, atomic_read(&netdev->carrier_up_count) + atomic_read(&netdev->carrier_down_count)); } static DEVICE_ATTR_RO(carrier_changes); static ssize_t carrier_up_count_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *netdev = to_net_dev(dev); return sysfs_emit(buf, fmt_dec, atomic_read(&netdev->carrier_up_count)); } static DEVICE_ATTR_RO(carrier_up_count); static ssize_t carrier_down_count_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *netdev = to_net_dev(dev); return sysfs_emit(buf, fmt_dec, atomic_read(&netdev->carrier_down_count)); } static DEVICE_ATTR_RO(carrier_down_count); /* read-write attributes */ static int change_mtu(struct net_device *dev, unsigned long new_mtu) { return dev_set_mtu(dev, (int)new_mtu); } static ssize_t mtu_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { return netdev_store(dev, attr, buf, len, change_mtu); } NETDEVICE_SHOW_RW(mtu, fmt_dec); static int change_flags(struct net_device *dev, unsigned 
long new_flags) { return dev_change_flags(dev, (unsigned int)new_flags, NULL); } static ssize_t flags_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { return netdev_store(dev, attr, buf, len, change_flags); } NETDEVICE_SHOW_RW(flags, fmt_hex); static ssize_t tx_queue_len_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { if (!capable(CAP_NET_ADMIN)) return -EPERM; return netdev_store(dev, attr, buf, len, dev_change_tx_queue_len); } NETDEVICE_SHOW_RW(tx_queue_len, fmt_dec); static int change_gro_flush_timeout(struct net_device *dev, unsigned long val) { netdev_set_gro_flush_timeout(dev, val); return 0; } static ssize_t gro_flush_timeout_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { if (!capable(CAP_NET_ADMIN)) return -EPERM; return netdev_lock_store(dev, attr, buf, len, change_gro_flush_timeout); } NETDEVICE_SHOW_RW(gro_flush_timeout, fmt_ulong); static int change_napi_defer_hard_irqs(struct net_device *dev, unsigned long val) { if (val > S32_MAX) return -ERANGE; netdev_set_defer_hard_irqs(dev, (u32)val); return 0; } static ssize_t napi_defer_hard_irqs_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { if (!capable(CAP_NET_ADMIN)) return -EPERM; return netdev_lock_store(dev, attr, buf, len, change_napi_defer_hard_irqs); } NETDEVICE_SHOW_RW(napi_defer_hard_irqs, fmt_uint); static ssize_t ifalias_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { struct net_device *netdev = to_net_dev(dev); struct net *net = dev_net(netdev); size_t count = len; ssize_t ret; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; /* ignore trailing newline */ if (len > 0 && buf[len - 1] == '\n') --count; ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev); if (ret) return ret; ret = dev_set_alias(netdev, buf, count); if (ret < 0) goto err; ret = len; netdev_state_change(netdev); err: rtnl_unlock(); return ret; } static ssize_t ifalias_show(struct device *dev, struct device_attribute *attr, char *buf) { const struct net_device *netdev = to_net_dev(dev); char tmp[IFALIASZ]; ssize_t ret; ret = dev_get_alias(netdev, tmp, sizeof(tmp)); if (ret > 0) ret = sysfs_emit(buf, "%s\n", tmp); return ret; } static DEVICE_ATTR_RW(ifalias); static int change_group(struct net_device *dev, unsigned long new_group) { dev_set_group(dev, (int)new_group); return 0; } static ssize_t group_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { return netdev_store(dev, attr, buf, len, change_group); } NETDEVICE_SHOW(group, fmt_dec); static DEVICE_ATTR(netdev_group, 0644, group_show, group_store); static int change_proto_down(struct net_device *dev, unsigned long proto_down) { return dev_change_proto_down(dev, (bool)proto_down); } static ssize_t proto_down_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { return netdev_store(dev, attr, buf, len, change_proto_down); } NETDEVICE_SHOW_RW(proto_down, fmt_dec); static ssize_t phys_port_id_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *netdev = to_net_dev(dev); struct netdev_phys_item_id ppid; ssize_t ret; ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev); if (ret) return ret; ret = dev_get_phys_port_id(netdev, &ppid); if (!ret) ret = sysfs_emit(buf, "%*phN\n", ppid.id_len, ppid.id); rtnl_unlock(); return ret; } static DEVICE_ATTR_RO(phys_port_id); static ssize_t 
phys_port_name_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *netdev = to_net_dev(dev); char name[IFNAMSIZ]; ssize_t ret; ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev); if (ret) return ret; ret = dev_get_phys_port_name(netdev, name, sizeof(name)); if (!ret) ret = sysfs_emit(buf, "%s\n", name); rtnl_unlock(); return ret; } static DEVICE_ATTR_RO(phys_port_name); static ssize_t phys_switch_id_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *netdev = to_net_dev(dev); struct netdev_phys_item_id ppid = { }; ssize_t ret; ret = sysfs_rtnl_lock(&dev->kobj, &attr->attr, netdev); if (ret) return ret; ret = netif_get_port_parent_id(netdev, &ppid, false); if (!ret) ret = sysfs_emit(buf, "%*phN\n", ppid.id_len, ppid.id); rtnl_unlock(); return ret; } static DEVICE_ATTR_RO(phys_switch_id); static struct attribute *netdev_phys_attrs[] __ro_after_init = { &dev_attr_phys_port_id.attr, &dev_attr_phys_port_name.attr, &dev_attr_phys_switch_id.attr, NULL, }; static umode_t netdev_phys_is_visible(struct kobject *kobj, struct attribute *attr, int index) { struct device *dev = kobj_to_dev(kobj); struct net_device *netdev = to_net_dev(dev); if (attr == &dev_attr_phys_port_id.attr) { if (!netdev->netdev_ops->ndo_get_phys_port_id) return 0; } else if (attr == &dev_attr_phys_port_name.attr) { if (!netdev->netdev_ops->ndo_get_phys_port_name && !netdev->devlink_port) return 0; } else if (attr == &dev_attr_phys_switch_id.attr) { if (!netdev->netdev_ops->ndo_get_port_parent_id && !netdev->devlink_port) return 0; } return attr->mode; } static const struct attribute_group netdev_phys_group = { .attrs = netdev_phys_attrs, .is_visible = netdev_phys_is_visible, }; static ssize_t threaded_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *netdev = to_net_dev(dev); ssize_t ret = -EINVAL; rcu_read_lock(); if (dev_isalive(netdev)) ret = sysfs_emit(buf, fmt_dec, READ_ONCE(netdev->threaded)); rcu_read_unlock(); return ret; } static int modify_napi_threaded(struct net_device *dev, unsigned long val) { int ret; if (list_empty(&dev->napi_list)) return -EOPNOTSUPP; if (val != 0 && val != 1) return -EOPNOTSUPP; ret = netif_set_threaded(dev, val); return ret; } static ssize_t threaded_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { return netdev_lock_store(dev, attr, buf, len, modify_napi_threaded); } static DEVICE_ATTR_RW(threaded); static struct attribute *net_class_attrs[] __ro_after_init = { &dev_attr_netdev_group.attr, &dev_attr_type.attr, &dev_attr_dev_id.attr, &dev_attr_dev_port.attr, &dev_attr_iflink.attr, &dev_attr_ifindex.attr, &dev_attr_name_assign_type.attr, &dev_attr_addr_assign_type.attr, &dev_attr_addr_len.attr, &dev_attr_link_mode.attr, &dev_attr_address.attr, &dev_attr_broadcast.attr, &dev_attr_speed.attr, &dev_attr_duplex.attr, &dev_attr_dormant.attr, &dev_attr_testing.attr, &dev_attr_operstate.attr, &dev_attr_carrier_changes.attr, &dev_attr_ifalias.attr, &dev_attr_carrier.attr, &dev_attr_mtu.attr, &dev_attr_flags.attr, &dev_attr_tx_queue_len.attr, &dev_attr_gro_flush_timeout.attr, &dev_attr_napi_defer_hard_irqs.attr, &dev_attr_proto_down.attr, &dev_attr_carrier_up_count.attr, &dev_attr_carrier_down_count.attr, &dev_attr_threaded.attr, NULL, }; ATTRIBUTE_GROUPS(net_class); /* Show a given an attribute in the statistics group */ static ssize_t netstat_show(const struct device *d, struct device_attribute *attr, char *buf, unsigned long offset) { struct 
net_device *dev = to_net_dev(d); ssize_t ret = -EINVAL; WARN_ON(offset > sizeof(struct rtnl_link_stats64) || offset % sizeof(u64) != 0); rcu_read_lock(); if (dev_isalive(dev)) { struct rtnl_link_stats64 temp; const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp); ret = sysfs_emit(buf, fmt_u64, *(u64 *)(((u8 *)stats) + offset)); } rcu_read_unlock(); return ret; } /* generate a read-only statistics attribute */ #define NETSTAT_ENTRY(name) \ static ssize_t name##_show(struct device *d, \ struct device_attribute *attr, char *buf) \ { \ return netstat_show(d, attr, buf, \ offsetof(struct rtnl_link_stats64, name)); \ } \ static DEVICE_ATTR_RO(name) NETSTAT_ENTRY(rx_packets); NETSTAT_ENTRY(tx_packets); NETSTAT_ENTRY(rx_bytes); NETSTAT_ENTRY(tx_bytes); NETSTAT_ENTRY(rx_errors); NETSTAT_ENTRY(tx_errors); NETSTAT_ENTRY(rx_dropped); NETSTAT_ENTRY(tx_dropped); NETSTAT_ENTRY(multicast); NETSTAT_ENTRY(collisions); NETSTAT_ENTRY(rx_length_errors); NETSTAT_ENTRY(rx_over_errors); NETSTAT_ENTRY(rx_crc_errors); NETSTAT_ENTRY(rx_frame_errors); NETSTAT_ENTRY(rx_fifo_errors); NETSTAT_ENTRY(rx_missed_errors); NETSTAT_ENTRY(tx_aborted_errors); NETSTAT_ENTRY(tx_carrier_errors); NETSTAT_ENTRY(tx_fifo_errors); NETSTAT_ENTRY(tx_heartbeat_errors); NETSTAT_ENTRY(tx_window_errors); NETSTAT_ENTRY(rx_compressed); NETSTAT_ENTRY(tx_compressed); NETSTAT_ENTRY(rx_nohandler); static struct attribute *netstat_attrs[] __ro_after_init = { &dev_attr_rx_packets.attr, &dev_attr_tx_packets.attr, &dev_attr_rx_bytes.attr, &dev_attr_tx_bytes.attr, &dev_attr_rx_errors.attr, &dev_attr_tx_errors.attr, &dev_attr_rx_dropped.attr, &dev_attr_tx_dropped.attr, &dev_attr_multicast.attr, &dev_attr_collisions.attr, &dev_attr_rx_length_errors.attr, &dev_attr_rx_over_errors.attr, &dev_attr_rx_crc_errors.attr, &dev_attr_rx_frame_errors.attr, &dev_attr_rx_fifo_errors.attr, &dev_attr_rx_missed_errors.attr, &dev_attr_tx_aborted_errors.attr, &dev_attr_tx_carrier_errors.attr, &dev_attr_tx_fifo_errors.attr, &dev_attr_tx_heartbeat_errors.attr, &dev_attr_tx_window_errors.attr, &dev_attr_rx_compressed.attr, &dev_attr_tx_compressed.attr, &dev_attr_rx_nohandler.attr, NULL }; static const struct attribute_group netstat_group = { .name = "statistics", .attrs = netstat_attrs, }; static struct attribute *wireless_attrs[] = { NULL }; static const struct attribute_group wireless_group = { .name = "wireless", .attrs = wireless_attrs, }; static bool wireless_group_needed(struct net_device *ndev) { #if IS_ENABLED(CONFIG_CFG80211) if (ndev->ieee80211_ptr) return true; #endif #if IS_ENABLED(CONFIG_WIRELESS_EXT) if (ndev->wireless_handlers) return true; #endif return false; } #else /* CONFIG_SYSFS */ #define net_class_groups NULL #endif /* CONFIG_SYSFS */ #ifdef CONFIG_SYSFS #define to_rx_queue_attr(_attr) \ container_of(_attr, struct rx_queue_attribute, attr) #define to_rx_queue(obj) container_of(obj, struct netdev_rx_queue, kobj) static ssize_t rx_queue_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) { const struct rx_queue_attribute *attribute = to_rx_queue_attr(attr); struct netdev_rx_queue *queue = to_rx_queue(kobj); if (!attribute->show) return -EIO; return attribute->show(queue, buf); } static ssize_t rx_queue_attr_store(struct kobject *kobj, struct attribute *attr, const char *buf, size_t count) { const struct rx_queue_attribute *attribute = to_rx_queue_attr(attr); struct netdev_rx_queue *queue = to_rx_queue(kobj); if (!attribute->store) return -EIO; return attribute->store(queue, buf, count); } static const struct sysfs_ops 
rx_queue_sysfs_ops = { .show = rx_queue_attr_show, .store = rx_queue_attr_store, }; #ifdef CONFIG_RPS static ssize_t show_rps_map(struct netdev_rx_queue *queue, char *buf) { struct rps_map *map; cpumask_var_t mask; int i, len; if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) return -ENOMEM; rcu_read_lock(); map = rcu_dereference(queue->rps_map); if (map) for (i = 0; i < map->len; i++) cpumask_set_cpu(map->cpus[i], mask); len = sysfs_emit(buf, "%*pb\n", cpumask_pr_args(mask)); rcu_read_unlock(); free_cpumask_var(mask); return len < PAGE_SIZE ? len : -EINVAL; } static int netdev_rx_queue_set_rps_mask(struct netdev_rx_queue *queue, cpumask_var_t mask) { static DEFINE_MUTEX(rps_map_mutex); struct rps_map *old_map, *map; int cpu, i; map = kzalloc(max_t(unsigned int, RPS_MAP_SIZE(cpumask_weight(mask)), L1_CACHE_BYTES), GFP_KERNEL); if (!map) return -ENOMEM; i = 0; for_each_cpu_and(cpu, mask, cpu_online_mask) map->cpus[i++] = cpu; if (i) { map->len = i; } else { kfree(map); map = NULL; } mutex_lock(&rps_map_mutex); old_map = rcu_dereference_protected(queue->rps_map, mutex_is_locked(&rps_map_mutex)); rcu_assign_pointer(queue->rps_map, map); if (map) static_branch_inc(&rps_needed); if (old_map) static_branch_dec(&rps_needed); mutex_unlock(&rps_map_mutex); if (old_map) kfree_rcu(old_map, rcu); return 0; } int rps_cpumask_housekeeping(struct cpumask *mask) { if (!cpumask_empty(mask)) { cpumask_and(mask, mask, housekeeping_cpumask(HK_TYPE_DOMAIN)); cpumask_and(mask, mask, housekeeping_cpumask(HK_TYPE_WQ)); if (cpumask_empty(mask)) return -EINVAL; } return 0; } static ssize_t store_rps_map(struct netdev_rx_queue *queue, const char *buf, size_t len) { cpumask_var_t mask; int err; if (!capable(CAP_NET_ADMIN)) return -EPERM; if (!alloc_cpumask_var(&mask, GFP_KERNEL)) return -ENOMEM; err = bitmap_parse(buf, len, cpumask_bits(mask), nr_cpumask_bits); if (err) goto out; err = rps_cpumask_housekeeping(mask); if (err) goto out; err = netdev_rx_queue_set_rps_mask(queue, mask); out: free_cpumask_var(mask); return err ? : len; } static ssize_t show_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, char *buf) { struct rps_dev_flow_table *flow_table; unsigned long val = 0; rcu_read_lock(); flow_table = rcu_dereference(queue->rps_flow_table); if (flow_table) val = 1UL << flow_table->log; rcu_read_unlock(); return sysfs_emit(buf, "%lu\n", val); } static void rps_dev_flow_table_release(struct rcu_head *rcu) { struct rps_dev_flow_table *table = container_of(rcu, struct rps_dev_flow_table, rcu); vfree(table); } static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, const char *buf, size_t len) { unsigned long mask, count; struct rps_dev_flow_table *table, *old_table; static DEFINE_SPINLOCK(rps_dev_flow_lock); int rc; if (!capable(CAP_NET_ADMIN)) return -EPERM; rc = kstrtoul(buf, 0, &count); if (rc < 0) return rc; if (count) { mask = count - 1; /* mask = roundup_pow_of_two(count) - 1; * without overflows... */ while ((mask | (mask >> 1)) != mask) mask |= (mask >> 1); /* On 64 bit arches, must check mask fits in table->mask (u32), * and on 32bit arches, must check * RPS_DEV_FLOW_TABLE_SIZE(mask + 1) doesn't overflow. 
*/ #if BITS_PER_LONG > 32 if (mask > (unsigned long)(u32)mask) return -EINVAL; #else if (mask > (ULONG_MAX - RPS_DEV_FLOW_TABLE_SIZE(1)) / sizeof(struct rps_dev_flow)) { /* Enforce a limit to prevent overflow */ return -EINVAL; } #endif table = vmalloc(RPS_DEV_FLOW_TABLE_SIZE(mask + 1)); if (!table) return -ENOMEM; table->log = ilog2(mask) + 1; for (count = 0; count <= mask; count++) { table->flows[count].cpu = RPS_NO_CPU; table->flows[count].filter = RPS_NO_FILTER; } } else { table = NULL; } spin_lock(&rps_dev_flow_lock); old_table = rcu_dereference_protected(queue->rps_flow_table, lockdep_is_held(&rps_dev_flow_lock)); rcu_assign_pointer(queue->rps_flow_table, table); spin_unlock(&rps_dev_flow_lock); if (old_table) call_rcu(&old_table->rcu, rps_dev_flow_table_release); return len; } static struct rx_queue_attribute rps_cpus_attribute __ro_after_init = __ATTR(rps_cpus, 0644, show_rps_map, store_rps_map); static struct rx_queue_attribute rps_dev_flow_table_cnt_attribute __ro_after_init = __ATTR(rps_flow_cnt, 0644, show_rps_dev_flow_table_cnt, store_rps_dev_flow_table_cnt); #endif /* CONFIG_RPS */ static struct attribute *rx_queue_default_attrs[] __ro_after_init = { #ifdef CONFIG_RPS &rps_cpus_attribute.attr, &rps_dev_flow_table_cnt_attribute.attr, #endif NULL }; ATTRIBUTE_GROUPS(rx_queue_default); static void rx_queue_release(struct kobject *kobj) { struct netdev_rx_queue *queue = to_rx_queue(kobj); #ifdef CONFIG_RPS struct rps_map *map; struct rps_dev_flow_table *flow_table; map = rcu_dereference_protected(queue->rps_map, 1); if (map) { RCU_INIT_POINTER(queue->rps_map, NULL); kfree_rcu(map, rcu); } flow_table = rcu_dereference_protected(queue->rps_flow_table, 1); if (flow_table) { RCU_INIT_POINTER(queue->rps_flow_table, NULL); call_rcu(&flow_table->rcu, rps_dev_flow_table_release); } #endif memset(kobj, 0, sizeof(*kobj)); netdev_put(queue->dev, &queue->dev_tracker); } static const void *rx_queue_namespace(const struct kobject *kobj) { struct netdev_rx_queue *queue = to_rx_queue(kobj); struct device *dev = &queue->dev->dev; const void *ns = NULL; if (dev->class && dev->class->namespace) ns = dev->class->namespace(dev); return ns; } static void rx_queue_get_ownership(const struct kobject *kobj, kuid_t *uid, kgid_t *gid) { const struct net *net = rx_queue_namespace(kobj); net_ns_get_ownership(net, uid, gid); } static const struct kobj_type rx_queue_ktype = { .sysfs_ops = &rx_queue_sysfs_ops, .release = rx_queue_release, .namespace = rx_queue_namespace, .get_ownership = rx_queue_get_ownership, }; static int rx_queue_default_mask(struct net_device *dev, struct netdev_rx_queue *queue) { #if IS_ENABLED(CONFIG_RPS) && IS_ENABLED(CONFIG_SYSCTL) struct cpumask *rps_default_mask; int res = 0; mutex_lock(&rps_default_mask_mutex); rps_default_mask = dev_net(dev)->core.rps_default_mask; if (rps_default_mask && !cpumask_empty(rps_default_mask)) res = netdev_rx_queue_set_rps_mask(queue, rps_default_mask); mutex_unlock(&rps_default_mask_mutex); return res; #else return 0; #endif } static int rx_queue_add_kobject(struct net_device *dev, int index) { struct netdev_rx_queue *queue = dev->_rx + index; struct kobject *kobj = &queue->kobj; int error = 0; /* Rx queues are cleared in rx_queue_release to allow later * re-registration. This is triggered when their kobj refcount is * dropped. 
* * If a queue is removed while both a read (or write) operation and a * the re-addition of the same queue are pending (waiting on rntl_lock) * it might happen that the re-addition will execute before the read, * making the initial removal to never happen (queue's kobj refcount * won't drop enough because of the pending read). In such rare case, * return to allow the removal operation to complete. */ if (unlikely(kobj->state_initialized)) { netdev_warn_once(dev, "Cannot re-add rx queues before their removal completed"); return -EAGAIN; } /* Kobject_put later will trigger rx_queue_release call which * decreases dev refcount: Take that reference here */ netdev_hold(queue->dev, &queue->dev_tracker, GFP_KERNEL); kobj->kset = dev->queues_kset; error = kobject_init_and_add(kobj, &rx_queue_ktype, NULL, "rx-%u", index); if (error) goto err; queue->groups = rx_queue_default_groups; error = sysfs_create_groups(kobj, queue->groups); if (error) goto err; if (dev->sysfs_rx_queue_group) { error = sysfs_create_group(kobj, dev->sysfs_rx_queue_group); if (error) goto err_default_groups; } error = rx_queue_default_mask(dev, queue); if (error) goto err_default_groups; kobject_uevent(kobj, KOBJ_ADD); return error; err_default_groups: sysfs_remove_groups(kobj, queue->groups); err: kobject_put(kobj); return error; } static int rx_queue_change_owner(struct net_device *dev, int index, kuid_t kuid, kgid_t kgid) { struct netdev_rx_queue *queue = dev->_rx + index; struct kobject *kobj = &queue->kobj; int error; error = sysfs_change_owner(kobj, kuid, kgid); if (error) return error; if (dev->sysfs_rx_queue_group) error = sysfs_group_change_owner( kobj, dev->sysfs_rx_queue_group, kuid, kgid); return error; } #endif /* CONFIG_SYSFS */ int net_rx_queue_update_kobjects(struct net_device *dev, int old_num, int new_num) { #ifdef CONFIG_SYSFS int i; int error = 0; #ifndef CONFIG_RPS if (!dev->sysfs_rx_queue_group) return 0; #endif for (i = old_num; i < new_num; i++) { error = rx_queue_add_kobject(dev, i); if (error) { new_num = old_num; break; } } while (--i >= new_num) { struct netdev_rx_queue *queue = &dev->_rx[i]; struct kobject *kobj = &queue->kobj; if (!check_net(dev_net(dev))) kobj->uevent_suppress = 1; if (dev->sysfs_rx_queue_group) sysfs_remove_group(kobj, dev->sysfs_rx_queue_group); sysfs_remove_groups(kobj, queue->groups); kobject_put(kobj); } return error; #else return 0; #endif } static int net_rx_queue_change_owner(struct net_device *dev, int num, kuid_t kuid, kgid_t kgid) { #ifdef CONFIG_SYSFS int error = 0; int i; #ifndef CONFIG_RPS if (!dev->sysfs_rx_queue_group) return 0; #endif for (i = 0; i < num; i++) { error = rx_queue_change_owner(dev, i, kuid, kgid); if (error) break; } return error; #else return 0; #endif } #ifdef CONFIG_SYSFS /* * netdev_queue sysfs structures and functions. 
*/ struct netdev_queue_attribute { struct attribute attr; ssize_t (*show)(struct kobject *kobj, struct attribute *attr, struct netdev_queue *queue, char *buf); ssize_t (*store)(struct kobject *kobj, struct attribute *attr, struct netdev_queue *queue, const char *buf, size_t len); }; #define to_netdev_queue_attr(_attr) \ container_of(_attr, struct netdev_queue_attribute, attr) #define to_netdev_queue(obj) container_of(obj, struct netdev_queue, kobj) static ssize_t netdev_queue_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) { const struct netdev_queue_attribute *attribute = to_netdev_queue_attr(attr); struct netdev_queue *queue = to_netdev_queue(kobj); if (!attribute->show) return -EIO; return attribute->show(kobj, attr, queue, buf); } static ssize_t netdev_queue_attr_store(struct kobject *kobj, struct attribute *attr, const char *buf, size_t count) { const struct netdev_queue_attribute *attribute = to_netdev_queue_attr(attr); struct netdev_queue *queue = to_netdev_queue(kobj); if (!attribute->store) return -EIO; return attribute->store(kobj, attr, queue, buf, count); } static const struct sysfs_ops netdev_queue_sysfs_ops = { .show = netdev_queue_attr_show, .store = netdev_queue_attr_store, }; static ssize_t tx_timeout_show(struct kobject *kobj, struct attribute *attr, struct netdev_queue *queue, char *buf) { unsigned long trans_timeout = atomic_long_read(&queue->trans_timeout); return sysfs_emit(buf, fmt_ulong, trans_timeout); } static unsigned int get_netdev_queue_index(struct netdev_queue *queue) { struct net_device *dev = queue->dev; unsigned int i; i = queue - dev->_tx; BUG_ON(i >= dev->num_tx_queues); return i; } static ssize_t traffic_class_show(struct kobject *kobj, struct attribute *attr, struct netdev_queue *queue, char *buf) { struct net_device *dev = queue->dev; int num_tc, tc, index, ret; if (!netif_is_multiqueue(dev)) return -ENOENT; ret = sysfs_rtnl_lock(kobj, attr, queue->dev); if (ret) return ret; index = get_netdev_queue_index(queue); /* If queue belongs to subordinate dev use its TC mapping */ dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev; num_tc = dev->num_tc; tc = netdev_txq_to_tc(dev, index); rtnl_unlock(); if (tc < 0) return -EINVAL; /* We can report the traffic class one of two ways: * Subordinate device traffic classes are reported with the traffic * class first, and then the subordinate class so for example TC0 on * subordinate device 2 will be reported as "0-2". If the queue * belongs to the root device it will be reported with just the * traffic class, so just "0" for TC 0 for example. */ return num_tc < 0 ? sysfs_emit(buf, "%d%d\n", tc, num_tc) : sysfs_emit(buf, "%d\n", tc); } #ifdef CONFIG_XPS static ssize_t tx_maxrate_show(struct kobject *kobj, struct attribute *attr, struct netdev_queue *queue, char *buf) { return sysfs_emit(buf, "%lu\n", queue->tx_maxrate); } static ssize_t tx_maxrate_store(struct kobject *kobj, struct attribute *attr, struct netdev_queue *queue, const char *buf, size_t len) { int err, index = get_netdev_queue_index(queue); struct net_device *dev = queue->dev; u32 rate = 0; if (!capable(CAP_NET_ADMIN)) return -EPERM; /* The check is also done later; this helps returning early without * hitting the locking section below. 
*/ if (!dev->netdev_ops->ndo_set_tx_maxrate) return -EOPNOTSUPP; err = kstrtou32(buf, 10, &rate); if (err < 0) return err; err = sysfs_rtnl_lock(kobj, attr, dev); if (err) return err; err = -EOPNOTSUPP; netdev_lock_ops(dev); if (dev->netdev_ops->ndo_set_tx_maxrate) err = dev->netdev_ops->ndo_set_tx_maxrate(dev, index, rate); netdev_unlock_ops(dev); if (!err) { queue->tx_maxrate = rate; rtnl_unlock(); return len; } rtnl_unlock(); return err; } static struct netdev_queue_attribute queue_tx_maxrate __ro_after_init = __ATTR_RW(tx_maxrate); #endif static struct netdev_queue_attribute queue_trans_timeout __ro_after_init = __ATTR_RO(tx_timeout); static struct netdev_queue_attribute queue_traffic_class __ro_after_init = __ATTR_RO(traffic_class); #ifdef CONFIG_BQL /* * Byte queue limits sysfs structures and functions. */ static ssize_t bql_show(char *buf, unsigned int value) { return sysfs_emit(buf, "%u\n", value); } static ssize_t bql_set(const char *buf, const size_t count, unsigned int *pvalue) { unsigned int value; int err; if (!strcmp(buf, "max") || !strcmp(buf, "max\n")) { value = DQL_MAX_LIMIT; } else { err = kstrtouint(buf, 10, &value); if (err < 0) return err; if (value > DQL_MAX_LIMIT) return -EINVAL; } *pvalue = value; return count; } static ssize_t bql_show_hold_time(struct kobject *kobj, struct attribute *attr, struct netdev_queue *queue, char *buf) { struct dql *dql = &queue->dql; return sysfs_emit(buf, "%u\n", jiffies_to_msecs(dql->slack_hold_time)); } static ssize_t bql_set_hold_time(struct kobject *kobj, struct attribute *attr, struct netdev_queue *queue, const char *buf, size_t len) { struct dql *dql = &queue->dql; unsigned int value; int err; err = kstrtouint(buf, 10, &value); if (err < 0) return err; dql->slack_hold_time = msecs_to_jiffies(value); return len; } static struct netdev_queue_attribute bql_hold_time_attribute __ro_after_init = __ATTR(hold_time, 0644, bql_show_hold_time, bql_set_hold_time); static ssize_t bql_show_stall_thrs(struct kobject *kobj, struct attribute *attr, struct netdev_queue *queue, char *buf) { struct dql *dql = &queue->dql; return sysfs_emit(buf, "%u\n", jiffies_to_msecs(dql->stall_thrs)); } static ssize_t bql_set_stall_thrs(struct kobject *kobj, struct attribute *attr, struct netdev_queue *queue, const char *buf, size_t len) { struct dql *dql = &queue->dql; unsigned int value; int err; err = kstrtouint(buf, 10, &value); if (err < 0) return err; value = msecs_to_jiffies(value); if (value && (value < 4 || value > 4 / 2 * BITS_PER_LONG)) return -ERANGE; if (!dql->stall_thrs && value) dql->last_reap = jiffies; /* Force last_reap to be live */ smp_wmb(); dql->stall_thrs = value; return len; } static struct netdev_queue_attribute bql_stall_thrs_attribute __ro_after_init = __ATTR(stall_thrs, 0644, bql_show_stall_thrs, bql_set_stall_thrs); static ssize_t bql_show_stall_max(struct kobject *kobj, struct attribute *attr, struct netdev_queue *queue, char *buf) { return sysfs_emit(buf, "%u\n", READ_ONCE(queue->dql.stall_max)); } static ssize_t bql_set_stall_max(struct kobject *kobj, struct attribute *attr, struct netdev_queue *queue, const char *buf, size_t len) { WRITE_ONCE(queue->dql.stall_max, 0); return len; } static struct netdev_queue_attribute bql_stall_max_attribute __ro_after_init = __ATTR(stall_max, 0644, bql_show_stall_max, bql_set_stall_max); static ssize_t bql_show_stall_cnt(struct kobject *kobj, struct attribute *attr, struct netdev_queue *queue, char *buf) { struct dql *dql = &queue->dql; return sysfs_emit(buf, "%lu\n", dql->stall_cnt); } static 
struct netdev_queue_attribute bql_stall_cnt_attribute __ro_after_init = __ATTR(stall_cnt, 0444, bql_show_stall_cnt, NULL); static ssize_t bql_show_inflight(struct kobject *kobj, struct attribute *attr, struct netdev_queue *queue, char *buf) { struct dql *dql = &queue->dql; return sysfs_emit(buf, "%u\n", dql->num_queued - dql->num_completed); } static struct netdev_queue_attribute bql_inflight_attribute __ro_after_init = __ATTR(inflight, 0444, bql_show_inflight, NULL); #define BQL_ATTR(NAME, FIELD) \ static ssize_t bql_show_ ## NAME(struct kobject *kobj, \ struct attribute *attr, \ struct netdev_queue *queue, char *buf) \ { \ return bql_show(buf, queue->dql.FIELD); \ } \ \ static ssize_t bql_set_ ## NAME(struct kobject *kobj, \ struct attribute *attr, \ struct netdev_queue *queue, \ const char *buf, size_t len) \ { \ return bql_set(buf, len, &queue->dql.FIELD); \ } \ \ static struct netdev_queue_attribute bql_ ## NAME ## _attribute __ro_after_init \ = __ATTR(NAME, 0644, \ bql_show_ ## NAME, bql_set_ ## NAME) BQL_ATTR(limit, limit); BQL_ATTR(limit_max, max_limit); BQL_ATTR(limit_min, min_limit); static struct attribute *dql_attrs[] __ro_after_init = { &bql_limit_attribute.attr, &bql_limit_max_attribute.attr, &bql_limit_min_attribute.attr, &bql_hold_time_attribute.attr, &bql_inflight_attribute.attr, &bql_stall_thrs_attribute.attr, &bql_stall_cnt_attribute.attr, &bql_stall_max_attribute.attr, NULL }; static const struct attribute_group dql_group = { .name = "byte_queue_limits", .attrs = dql_attrs, }; #else /* Fake declaration, all the code using it should be dead */ static const struct attribute_group dql_group = {}; #endif /* CONFIG_BQL */ #ifdef CONFIG_XPS static ssize_t xps_queue_show(struct net_device *dev, unsigned int index, int tc, char *buf, enum xps_map_type type) { struct xps_dev_maps *dev_maps; unsigned long *mask; unsigned int nr_ids; int j, len; rcu_read_lock(); dev_maps = rcu_dereference(dev->xps_maps[type]); /* Default to nr_cpu_ids/dev->num_rx_queues and do not just return 0 * when dev_maps hasn't been allocated yet, to be backward compatible. */ nr_ids = dev_maps ? dev_maps->nr_ids : (type == XPS_CPUS ? nr_cpu_ids : dev->num_rx_queues); mask = bitmap_zalloc(nr_ids, GFP_NOWAIT); if (!mask) { rcu_read_unlock(); return -ENOMEM; } if (!dev_maps || tc >= dev_maps->num_tc) goto out_no_maps; for (j = 0; j < nr_ids; j++) { int i, tci = j * dev_maps->num_tc + tc; struct xps_map *map; map = rcu_dereference(dev_maps->attr_map[tci]); if (!map) continue; for (i = map->len; i--;) { if (map->queues[i] == index) { __set_bit(j, mask); break; } } } out_no_maps: rcu_read_unlock(); len = bitmap_print_to_pagebuf(false, buf, mask, nr_ids); bitmap_free(mask); return len < PAGE_SIZE ? len : -EINVAL; } static ssize_t xps_cpus_show(struct kobject *kobj, struct attribute *attr, struct netdev_queue *queue, char *buf) { struct net_device *dev = queue->dev; unsigned int index; int len, tc, ret; if (!netif_is_multiqueue(dev)) return -ENOENT; index = get_netdev_queue_index(queue); ret = sysfs_rtnl_lock(kobj, attr, queue->dev); if (ret) return ret; /* If queue belongs to subordinate dev use its map */ dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev; tc = netdev_txq_to_tc(dev, index); if (tc < 0) { rtnl_unlock(); return -EINVAL; } /* Increase the net device refcnt to make sure it won't be freed while * xps_queue_show is running. 
*/ dev_hold(dev); rtnl_unlock(); len = xps_queue_show(dev, index, tc, buf, XPS_CPUS); dev_put(dev); return len; } static ssize_t xps_cpus_store(struct kobject *kobj, struct attribute *attr, struct netdev_queue *queue, const char *buf, size_t len) { struct net_device *dev = queue->dev; unsigned int index; cpumask_var_t mask; int err; if (!netif_is_multiqueue(dev)) return -ENOENT; if (!capable(CAP_NET_ADMIN)) return -EPERM; if (!alloc_cpumask_var(&mask, GFP_KERNEL)) return -ENOMEM; index = get_netdev_queue_index(queue); err = bitmap_parse(buf, len, cpumask_bits(mask), nr_cpumask_bits); if (err) { free_cpumask_var(mask); return err; } err = sysfs_rtnl_lock(kobj, attr, dev); if (err) { free_cpumask_var(mask); return err; } err = netif_set_xps_queue(dev, mask, index); rtnl_unlock(); free_cpumask_var(mask); return err ? : len; } static struct netdev_queue_attribute xps_cpus_attribute __ro_after_init = __ATTR_RW(xps_cpus); static ssize_t xps_rxqs_show(struct kobject *kobj, struct attribute *attr, struct netdev_queue *queue, char *buf) { struct net_device *dev = queue->dev; unsigned int index; int tc, ret; index = get_netdev_queue_index(queue); ret = sysfs_rtnl_lock(kobj, attr, dev); if (ret) return ret; tc = netdev_txq_to_tc(dev, index); /* Increase the net device refcnt to make sure it won't be freed while * xps_queue_show is running. */ dev_hold(dev); rtnl_unlock(); ret = tc >= 0 ? xps_queue_show(dev, index, tc, buf, XPS_RXQS) : -EINVAL; dev_put(dev); return ret; } static ssize_t xps_rxqs_store(struct kobject *kobj, struct attribute *attr, struct netdev_queue *queue, const char *buf, size_t len) { struct net_device *dev = queue->dev; struct net *net = dev_net(dev); unsigned long *mask; unsigned int index; int err; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; mask = bitmap_zalloc(dev->num_rx_queues, GFP_KERNEL); if (!mask) return -ENOMEM; index = get_netdev_queue_index(queue); err = bitmap_parse(buf, len, mask, dev->num_rx_queues); if (err) { bitmap_free(mask); return err; } err = sysfs_rtnl_lock(kobj, attr, dev); if (err) { bitmap_free(mask); return err; } cpus_read_lock(); err = __netif_set_xps_queue(dev, mask, index, XPS_RXQS); cpus_read_unlock(); rtnl_unlock(); bitmap_free(mask); return err ? 
: len; } static struct netdev_queue_attribute xps_rxqs_attribute __ro_after_init = __ATTR_RW(xps_rxqs); #endif /* CONFIG_XPS */ static struct attribute *netdev_queue_default_attrs[] __ro_after_init = { &queue_trans_timeout.attr, &queue_traffic_class.attr, #ifdef CONFIG_XPS &xps_cpus_attribute.attr, &xps_rxqs_attribute.attr, &queue_tx_maxrate.attr, #endif NULL }; ATTRIBUTE_GROUPS(netdev_queue_default); static void netdev_queue_release(struct kobject *kobj) { struct netdev_queue *queue = to_netdev_queue(kobj); memset(kobj, 0, sizeof(*kobj)); netdev_put(queue->dev, &queue->dev_tracker); } static const void *netdev_queue_namespace(const struct kobject *kobj) { struct netdev_queue *queue = to_netdev_queue(kobj); struct device *dev = &queue->dev->dev; const void *ns = NULL; if (dev->class && dev->class->namespace) ns = dev->class->namespace(dev); return ns; } static void netdev_queue_get_ownership(const struct kobject *kobj, kuid_t *uid, kgid_t *gid) { const struct net *net = netdev_queue_namespace(kobj); net_ns_get_ownership(net, uid, gid); } static const struct kobj_type netdev_queue_ktype = { .sysfs_ops = &netdev_queue_sysfs_ops, .release = netdev_queue_release, .namespace = netdev_queue_namespace, .get_ownership = netdev_queue_get_ownership, }; static bool netdev_uses_bql(const struct net_device *dev) { if (dev->lltx || (dev->priv_flags & IFF_NO_QUEUE)) return false; return IS_ENABLED(CONFIG_BQL); } static int netdev_queue_add_kobject(struct net_device *dev, int index) { struct netdev_queue *queue = dev->_tx + index; struct kobject *kobj = &queue->kobj; int error = 0; /* Tx queues are cleared in netdev_queue_release to allow later * re-registration. This is triggered when their kobj refcount is * dropped. * * If a queue is removed while both a read (or write) operation and a * the re-addition of the same queue are pending (waiting on rntl_lock) * it might happen that the re-addition will execute before the read, * making the initial removal to never happen (queue's kobj refcount * won't drop enough because of the pending read). In such rare case, * return to allow the removal operation to complete. 
*/ if (unlikely(kobj->state_initialized)) { netdev_warn_once(dev, "Cannot re-add tx queues before their removal completed"); return -EAGAIN; } /* Kobject_put later will trigger netdev_queue_release call * which decreases dev refcount: Take that reference here */ netdev_hold(queue->dev, &queue->dev_tracker, GFP_KERNEL); kobj->kset = dev->queues_kset; error = kobject_init_and_add(kobj, &netdev_queue_ktype, NULL, "tx-%u", index); if (error) goto err; queue->groups = netdev_queue_default_groups; error = sysfs_create_groups(kobj, queue->groups); if (error) goto err; if (netdev_uses_bql(dev)) { error = sysfs_create_group(kobj, &dql_group); if (error) goto err_default_groups; } kobject_uevent(kobj, KOBJ_ADD); return 0; err_default_groups: sysfs_remove_groups(kobj, queue->groups); err: kobject_put(kobj); return error; } static int tx_queue_change_owner(struct net_device *ndev, int index, kuid_t kuid, kgid_t kgid) { struct netdev_queue *queue = ndev->_tx + index; struct kobject *kobj = &queue->kobj; int error; error = sysfs_change_owner(kobj, kuid, kgid); if (error) return error; if (netdev_uses_bql(ndev)) error = sysfs_group_change_owner(kobj, &dql_group, kuid, kgid); return error; } #endif /* CONFIG_SYSFS */ int netdev_queue_update_kobjects(struct net_device *dev, int old_num, int new_num) { #ifdef CONFIG_SYSFS int i; int error = 0; /* Tx queue kobjects are allowed to be updated when a device is being * unregistered, but solely to remove queues from qdiscs. Any path * adding queues should be fixed. */ WARN(dev->reg_state == NETREG_UNREGISTERING && new_num > old_num, "New queues can't be registered after device unregistration."); for (i = old_num; i < new_num; i++) { error = netdev_queue_add_kobject(dev, i); if (error) { new_num = old_num; break; } } while (--i >= new_num) { struct netdev_queue *queue = dev->_tx + i; if (!check_net(dev_net(dev))) queue->kobj.uevent_suppress = 1; if (netdev_uses_bql(dev)) sysfs_remove_group(&queue->kobj, &dql_group); sysfs_remove_groups(&queue->kobj, queue->groups); kobject_put(&queue->kobj); } return error; #else return 0; #endif /* CONFIG_SYSFS */ } static int net_tx_queue_change_owner(struct net_device *dev, int num, kuid_t kuid, kgid_t kgid) { #ifdef CONFIG_SYSFS int error = 0; int i; for (i = 0; i < num; i++) { error = tx_queue_change_owner(dev, i, kuid, kgid); if (error) break; } return error; #else return 0; #endif /* CONFIG_SYSFS */ } static int register_queue_kobjects(struct net_device *dev) { int error = 0, txq = 0, rxq = 0, real_rx = 0, real_tx = 0; #ifdef CONFIG_SYSFS dev->queues_kset = kset_create_and_add("queues", NULL, &dev->dev.kobj); if (!dev->queues_kset) return -ENOMEM; real_rx = dev->real_num_rx_queues; #endif real_tx = dev->real_num_tx_queues; error = net_rx_queue_update_kobjects(dev, 0, real_rx); if (error) goto error; rxq = real_rx; error = netdev_queue_update_kobjects(dev, 0, real_tx); if (error) goto error; txq = real_tx; return 0; error: netdev_queue_update_kobjects(dev, txq, 0); net_rx_queue_update_kobjects(dev, rxq, 0); #ifdef CONFIG_SYSFS kset_unregister(dev->queues_kset); #endif return error; } static int queue_change_owner(struct net_device *ndev, kuid_t kuid, kgid_t kgid) { int error = 0, real_rx = 0, real_tx = 0; #ifdef CONFIG_SYSFS if (ndev->queues_kset) { error = sysfs_change_owner(&ndev->queues_kset->kobj, kuid, kgid); if (error) return error; } real_rx = ndev->real_num_rx_queues; #endif real_tx = ndev->real_num_tx_queues; error = net_rx_queue_change_owner(ndev, real_rx, kuid, kgid); if (error) return error; error = 
net_tx_queue_change_owner(ndev, real_tx, kuid, kgid); if (error) return error; return 0; } static void remove_queue_kobjects(struct net_device *dev) { int real_rx = 0, real_tx = 0; #ifdef CONFIG_SYSFS real_rx = dev->real_num_rx_queues; #endif real_tx = dev->real_num_tx_queues; net_rx_queue_update_kobjects(dev, real_rx, 0); netdev_queue_update_kobjects(dev, real_tx, 0); netdev_lock_ops(dev); dev->real_num_rx_queues = 0; dev->real_num_tx_queues = 0; netdev_unlock_ops(dev); #ifdef CONFIG_SYSFS kset_unregister(dev->queues_kset); #endif } static bool net_current_may_mount(void) { struct net *net = current->nsproxy->net_ns; return ns_capable(net->user_ns, CAP_SYS_ADMIN); } static void *net_grab_current_ns(void) { struct net *ns = current->nsproxy->net_ns; #ifdef CONFIG_NET_NS if (ns) refcount_inc(&ns->passive); #endif return ns; } static const void *net_initial_ns(void) { return &init_net; } static const void *net_netlink_ns(struct sock *sk) { return sock_net(sk); } const struct kobj_ns_type_operations net_ns_type_operations = { .type = KOBJ_NS_TYPE_NET, .current_may_mount = net_current_may_mount, .grab_current_ns = net_grab_current_ns, .netlink_ns = net_netlink_ns, .initial_ns = net_initial_ns, .drop_ns = net_drop_ns, }; EXPORT_SYMBOL_GPL(net_ns_type_operations); static int netdev_uevent(const struct device *d, struct kobj_uevent_env *env) { const struct net_device *dev = to_net_dev(d); int retval; /* pass interface to uevent. */ retval = add_uevent_var(env, "INTERFACE=%s", dev->name); if (retval) goto exit; /* pass ifindex to uevent. * ifindex is useful as it won't change (interface name may change) * and is what RtNetlink uses natively. */ retval = add_uevent_var(env, "IFINDEX=%d", dev->ifindex); exit: return retval; } /* * netdev_release -- destroy and free a dead device. * Called when last reference to device kobject is gone. */ static void netdev_release(struct device *d) { struct net_device *dev = to_net_dev(d); BUG_ON(dev->reg_state != NETREG_RELEASED); /* no need to wait for rcu grace period: * device is dead and about to be freed. */ kfree(rcu_access_pointer(dev->ifalias)); kvfree(dev); } static const void *net_namespace(const struct device *d) { const struct net_device *dev = to_net_dev(d); return dev_net(dev); } static void net_get_ownership(const struct device *d, kuid_t *uid, kgid_t *gid) { const struct net_device *dev = to_net_dev(d); const struct net *net = dev_net(dev); net_ns_get_ownership(net, uid, gid); } static const struct class net_class = { .name = "net", .dev_release = netdev_release, .dev_groups = net_class_groups, .dev_uevent = netdev_uevent, .ns_type = &net_ns_type_operations, .namespace = net_namespace, .get_ownership = net_get_ownership, }; #ifdef CONFIG_OF static int of_dev_node_match(struct device *dev, const void *data) { for (; dev; dev = dev->parent) { if (dev->of_node == data) return 1; } return 0; } /* * of_find_net_device_by_node - lookup the net device for the device node * @np: OF device node * * Looks up the net_device structure corresponding with the device node. * If successful, returns a pointer to the net_device with the embedded * struct device refcount incremented by one, or NULL on failure. The * refcount must be dropped when done with the net_device. 
*/ struct net_device *of_find_net_device_by_node(struct device_node *np) { struct device *dev; dev = class_find_device(&net_class, NULL, np, of_dev_node_match); if (!dev) return NULL; return to_net_dev(dev); } EXPORT_SYMBOL(of_find_net_device_by_node); #endif /* Delete sysfs entries but hold kobject reference until after all * netdev references are gone. */ void netdev_unregister_kobject(struct net_device *ndev) { struct device *dev = &ndev->dev; if (!check_net(dev_net(ndev))) dev_set_uevent_suppress(dev, 1); kobject_get(&dev->kobj); remove_queue_kobjects(ndev); pm_runtime_set_memalloc_noio(dev, false); device_del(dev); } /* Create sysfs entries for network device. */ int netdev_register_kobject(struct net_device *ndev) { struct device *dev = &ndev->dev; const struct attribute_group **groups = ndev->sysfs_groups; int error = 0; device_initialize(dev); dev->class = &net_class; dev->platform_data = ndev; dev->groups = groups; dev_set_name(dev, "%s", ndev->name); #ifdef CONFIG_SYSFS /* Allow for a device specific group */ if (*groups) groups++; *groups++ = &netstat_group; *groups++ = &netdev_phys_group; if (wireless_group_needed(ndev)) *groups++ = &wireless_group; #endif /* CONFIG_SYSFS */ error = device_add(dev); if (error) return error; error = register_queue_kobjects(ndev); if (error) { device_del(dev); return error; } pm_runtime_set_memalloc_noio(dev, true); return error; } /* Change owner for sysfs entries when moving network devices across network * namespaces owned by different user namespaces. */ int netdev_change_owner(struct net_device *ndev, const struct net *net_old, const struct net *net_new) { kuid_t old_uid = GLOBAL_ROOT_UID, new_uid = GLOBAL_ROOT_UID; kgid_t old_gid = GLOBAL_ROOT_GID, new_gid = GLOBAL_ROOT_GID; struct device *dev = &ndev->dev; int error; net_ns_get_ownership(net_old, &old_uid, &old_gid); net_ns_get_ownership(net_new, &new_uid, &new_gid); /* The network namespace was changed but the owning user namespace is * identical so there's no need to change the owner of sysfs entries. */ if (uid_eq(old_uid, new_uid) && gid_eq(old_gid, new_gid)) return 0; error = device_change_owner(dev, new_uid, new_gid); if (error) return error; error = queue_change_owner(ndev, new_uid, new_gid); if (error) return error; return 0; } int netdev_class_create_file_ns(const struct class_attribute *class_attr, const void *ns) { return class_create_file_ns(&net_class, class_attr, ns); } EXPORT_SYMBOL(netdev_class_create_file_ns); void netdev_class_remove_file_ns(const struct class_attribute *class_attr, const void *ns) { class_remove_file_ns(&net_class, class_attr, ns); } EXPORT_SYMBOL(netdev_class_remove_file_ns); int __init netdev_kobject_init(void) { kobj_ns_type_register(&net_ns_type_operations); return class_register(&net_class); }
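/* The queue kobjects registered by this file expose per-queue attributes
 * under /sys/class/net/<iface>/queues/. As a rough userspace sketch, the RPS
 * CPU mask handled by store_rps_map() can be configured as shown below; the
 * interface name and mask value are assumptions for illustration only, and
 * CAP_NET_ADMIN is required, matching the capable() check in store_rps_map().
 * Again, this is userspace code, not part of this kernel file.
 */
#include <stdio.h>

int main(void)
{
	/* rx-0's rps_cpus attribute, created by rx_queue_add_kobject() */
	const char *path = "/sys/class/net/eth0/queues/rx-0/rps_cpus";
	FILE *f = fopen(path, "w");

	if (!f) {
		perror("fopen");
		return 1;
	}
	/* hexadecimal CPU bitmap parsed by bitmap_parse(): CPUs 0-3 */
	fputs("f\n", f);
	fclose(f);
	return 0;
}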
/*
 * Non-physical true random number generator based on timing jitter --
 * Jitter RNG standalone code.
* * Copyright Stephan Mueller <smueller@chronox.de>, 2015 - 2023 * * Design * ====== * * See https://www.chronox.de/jent.html * * License * ======= * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, and the entire permission notice in its entirety, * including the disclaimer of warranties. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote * products derived from this software without specific prior * written permission. * * ALTERNATIVELY, this product may be distributed under the terms of * the GNU General Public License, in which case the provisions of the GPL2 are * required INSTEAD OF the above restrictions. (This clause is * necessary due to a potential bad interaction between the GPL and * the restrictions contained in a BSD-style copyright.) * * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ALL OF * WHICH ARE HEREBY DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE * USE OF THIS SOFTWARE, EVEN IF NOT ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. */ /* * This Jitterentropy RNG is based on the jitterentropy library * version 3.4.0 provided at https://www.chronox.de/jent.html */ #ifdef __OPTIMIZE__ #error "The CPU Jitter random number generator must not be compiled with optimizations. See documentation. Use the compiler switch -O0 for compiling jitterentropy.c." #endif typedef unsigned long long __u64; typedef long long __s64; typedef unsigned int __u32; typedef unsigned char u8; #define NULL ((void *) 0) /* The entropy pool */ struct rand_data { /* SHA3-256 is used as conditioner */ #define DATA_SIZE_BITS 256 /* all data values that are vital to maintain the security * of the RNG are marked as SENSITIVE. A user must not * access that information while the RNG executes its loops to * calculate the next random value. 
*/ void *hash_state; /* SENSITIVE hash state entropy pool */ __u64 prev_time; /* SENSITIVE Previous time stamp */ __u64 last_delta; /* SENSITIVE stuck test */ __s64 last_delta2; /* SENSITIVE stuck test */ unsigned int flags; /* Flags used to initialize */ unsigned int osr; /* Oversample rate */ #define JENT_MEMORY_ACCESSLOOPS 128 #define JENT_MEMORY_SIZE \ (CONFIG_CRYPTO_JITTERENTROPY_MEMORY_BLOCKS * \ CONFIG_CRYPTO_JITTERENTROPY_MEMORY_BLOCKSIZE) unsigned char *mem; /* Memory access location with size of * memblocks * memblocksize */ unsigned int memlocation; /* Pointer to byte in *mem */ unsigned int memblocks; /* Number of memory blocks in *mem */ unsigned int memblocksize; /* Size of one memory block in bytes */ unsigned int memaccessloops; /* Number of memory accesses per random * bit generation */ /* Repetition Count Test */ unsigned int rct_count; /* Number of stuck values */ /* Adaptive Proportion Test cutoff values */ unsigned int apt_cutoff; /* Intermittent health test failure */ unsigned int apt_cutoff_permanent; /* Permanent health test failure */ #define JENT_APT_WINDOW_SIZE 512 /* Data window size */ /* LSB of time stamp to process */ #define JENT_APT_LSB 16 #define JENT_APT_WORD_MASK (JENT_APT_LSB - 1) unsigned int apt_observations; /* Number of collected observations */ unsigned int apt_count; /* APT counter */ unsigned int apt_base; /* APT base reference */ unsigned int health_failure; /* Record health failure */ unsigned int apt_base_set:1; /* APT base reference set? */ }; /* Flags that can be used to initialize the RNG */ #define JENT_DISABLE_MEMORY_ACCESS (1<<2) /* Disable memory access for more * entropy, saves MEMORY_SIZE RAM for * entropy collector */ /* -- error codes for init function -- */ #define JENT_ENOTIME 1 /* Timer service not available */ #define JENT_ECOARSETIME 2 /* Timer too coarse for RNG */ #define JENT_ENOMONOTONIC 3 /* Timer is not monotonic increasing */ #define JENT_EVARVAR 5 /* Timer does not produce variations of * variations (2nd derivation of time is * zero). */ #define JENT_ESTUCK 8 /* Too many stuck results during init. */ #define JENT_EHEALTH 9 /* Health test failed during initialization */ #define JENT_ERCT 10 /* RCT failed during initialization */ #define JENT_EHASH 11 /* Hash self test failed */ #define JENT_EMEM 12 /* Can't allocate memory for initialization */ #define JENT_RCT_FAILURE 1 /* Failure in RCT health test. */ #define JENT_APT_FAILURE 2 /* Failure in APT health test. */ #define JENT_PERMANENT_FAILURE_SHIFT 16 #define JENT_PERMANENT_FAILURE(x) (x << JENT_PERMANENT_FAILURE_SHIFT) #define JENT_RCT_FAILURE_PERMANENT JENT_PERMANENT_FAILURE(JENT_RCT_FAILURE) #define JENT_APT_FAILURE_PERMANENT JENT_PERMANENT_FAILURE(JENT_APT_FAILURE) /* * The output n bits can receive more than n bits of min entropy, of course, * but the fixed output of the conditioning function can only asymptotically * approach the output size bits of min entropy, not attain that bound. Random * maps will tend to have output collisions, which reduces the creditable * output entropy (that is what SP 800-90B Section 3.1.5.1.2 attempts to bound). * * The value "64" is justified in Appendix A.4 of the current 90C draft, * and aligns with NIST's in "epsilon" definition in this document, which is * that a string can be considered "full entropy" if you can bound the min * entropy in each bit of output to at least 1-epsilon, where epsilon is * required to be <= 2^(-32). 
*/ #define JENT_ENTROPY_SAFETY_FACTOR 64 #include <linux/array_size.h> #include <linux/fips.h> #include <linux/minmax.h> #include "jitterentropy.h" /*************************************************************************** * Adaptive Proportion Test * * This test complies with SP800-90B section 4.4.2. ***************************************************************************/ /* * See the SP 800-90B comment #10b for the corrected cutoff for the SP 800-90B * APT. * https://www.untruth.org/~josh/sp80090b/UL%20SP800-90B-final%20comments%20v1.9%2020191212.pdf * In the syntax of R, this is C = 2 + qbinom(1 − 2^(−30), 511, 2^(-1/osr)). * (The original formula wasn't correct because the first symbol must * necessarily have been observed, so there is no chance of observing 0 of these * symbols.) * * For the alpha < 2^-53, R cannot be used as it uses a float data type without * arbitrary precision. A SageMath script is used to calculate those cutoff * values. * * For any value above 14, this yields the maximal allowable value of 512 * (by FIPS 140-2 IG 7.19 Resolution # 16, we cannot choose a cutoff value that * renders the test unable to fail). */ static const unsigned int jent_apt_cutoff_lookup[15] = { 325, 422, 459, 477, 488, 494, 499, 502, 505, 507, 508, 509, 510, 511, 512 }; static const unsigned int jent_apt_cutoff_permanent_lookup[15] = { 355, 447, 479, 494, 502, 507, 510, 512, 512, 512, 512, 512, 512, 512, 512 }; static void jent_apt_init(struct rand_data *ec, unsigned int osr) { /* * Establish the apt_cutoff based on the presumed entropy rate of * 1/osr. */ if (osr >= ARRAY_SIZE(jent_apt_cutoff_lookup)) { ec->apt_cutoff = jent_apt_cutoff_lookup[ ARRAY_SIZE(jent_apt_cutoff_lookup) - 1]; ec->apt_cutoff_permanent = jent_apt_cutoff_permanent_lookup[ ARRAY_SIZE(jent_apt_cutoff_permanent_lookup) - 1]; } else { ec->apt_cutoff = jent_apt_cutoff_lookup[osr - 1]; ec->apt_cutoff_permanent = jent_apt_cutoff_permanent_lookup[osr - 1]; } } /* * Reset the APT counter * * @ec [in] Reference to entropy collector */ static void jent_apt_reset(struct rand_data *ec, unsigned int delta_masked) { /* Reset APT counter */ ec->apt_count = 0; ec->apt_base = delta_masked; ec->apt_observations = 0; } /* * Insert a new entropy event into APT * * @ec [in] Reference to entropy collector * @delta_masked [in] Masked time delta to process */ static void jent_apt_insert(struct rand_data *ec, unsigned int delta_masked) { /* Initialize the base reference */ if (!ec->apt_base_set) { ec->apt_base = delta_masked; ec->apt_base_set = 1; return; } if (delta_masked == ec->apt_base) { ec->apt_count++; /* Note, ec->apt_count starts with one. */ if (ec->apt_count >= ec->apt_cutoff_permanent) ec->health_failure |= JENT_APT_FAILURE_PERMANENT; else if (ec->apt_count >= ec->apt_cutoff) ec->health_failure |= JENT_APT_FAILURE; } ec->apt_observations++; if (ec->apt_observations >= JENT_APT_WINDOW_SIZE) jent_apt_reset(ec, delta_masked); } /*************************************************************************** * Stuck Test and its use as Repetition Count Test * * The Jitter RNG uses an enhanced version of the Repetition Count Test * (RCT) specified in SP800-90B section 4.4.1. Instead of counting identical * back-to-back values, the input to the RCT is the counting of the stuck * values during the generation of one Jitter RNG output block. * * The RCT is applied with an alpha of 2^{-30} compliant to FIPS 140-2 IG 9.8. * * During the counting operation, the Jitter RNG always calculates the RCT * cut-off value of C. 
If that value exceeds the allowed cut-off value, * the Jitter RNG output block will be calculated completely but discarded at * the end. The caller of the Jitter RNG is informed with an error code. ***************************************************************************/ /* * Repetition Count Test as defined in SP800-90B section 4.4.1 * * @ec [in] Reference to entropy collector * @stuck [in] Indicator whether the value is stuck */ static void jent_rct_insert(struct rand_data *ec, int stuck) { if (stuck) { ec->rct_count++; /* * The cutoff value is based on the following consideration: * alpha = 2^-30 or 2^-60 as recommended in SP800-90B. * In addition, we require an entropy value H of 1/osr as this * is the minimum entropy required to provide full entropy. * Note, we collect (DATA_SIZE_BITS + ENTROPY_SAFETY_FACTOR)*osr * deltas for inserting them into the entropy pool which should * then have (close to) DATA_SIZE_BITS bits of entropy in the * conditioned output. * * Note, ec->rct_count (which equals to value B in the pseudo * code of SP800-90B section 4.4.1) starts with zero. Hence * we need to subtract one from the cutoff value as calculated * following SP800-90B. Thus C = ceil(-log_2(alpha)/H) = 30*osr * or 60*osr. */ if ((unsigned int)ec->rct_count >= (60 * ec->osr)) { ec->rct_count = -1; ec->health_failure |= JENT_RCT_FAILURE_PERMANENT; } else if ((unsigned int)ec->rct_count >= (30 * ec->osr)) { ec->rct_count = -1; ec->health_failure |= JENT_RCT_FAILURE; } } else { /* Reset RCT */ ec->rct_count = 0; } } static inline __u64 jent_delta(__u64 prev, __u64 next) { #define JENT_UINT64_MAX (__u64)(~((__u64) 0)) return (prev < next) ? (next - prev) : (JENT_UINT64_MAX - prev + 1 + next); } /* * Stuck test by checking the: * 1st derivative of the jitter measurement (time delta) * 2nd derivative of the jitter measurement (delta of time deltas) * 3rd derivative of the jitter measurement (delta of delta of time deltas) * * All values must always be non-zero. * * @ec [in] Reference to entropy collector * @current_delta [in] Jitter time delta * * @return * 0 jitter measurement not stuck (good bit) * 1 jitter measurement stuck (reject bit) */ static int jent_stuck(struct rand_data *ec, __u64 current_delta) { __u64 delta2 = jent_delta(ec->last_delta, current_delta); __u64 delta3 = jent_delta(ec->last_delta2, delta2); ec->last_delta = current_delta; ec->last_delta2 = delta2; /* * Insert the result of the comparison of two back-to-back time * deltas. */ jent_apt_insert(ec, current_delta); if (!current_delta || !delta2 || !delta3) { /* RCT with a stuck bit */ jent_rct_insert(ec, 1); return 1; } /* RCT with a non-stuck bit */ jent_rct_insert(ec, 0); return 0; } /* * Report any health test failures * * @ec [in] Reference to entropy collector * * @return a bitmask indicating which tests failed * 0 No health test failure * 1 RCT failure * 2 APT failure * 1<<JENT_PERMANENT_FAILURE_SHIFT RCT permanent failure * 2<<JENT_PERMANENT_FAILURE_SHIFT APT permanent failure */ static unsigned int jent_health_failure(struct rand_data *ec) { /* Test is only enabled in FIPS mode */ if (!fips_enabled) return 0; return ec->health_failure; } /*************************************************************************** * Noise sources ***************************************************************************/ /* * Update of the loop count used for the next round of * an entropy collection. 
* * Input: * @bits is the number of low bits of the timer to consider * @min is the number of bits we shift the timer value to the right at * the end to make sure we have a guaranteed minimum value * * @return Newly calculated loop counter */ static __u64 jent_loop_shuffle(unsigned int bits, unsigned int min) { __u64 time = 0; __u64 shuffle = 0; unsigned int i = 0; unsigned int mask = (1<<bits) - 1; jent_get_nstime(&time); /* * We fold the time value as much as possible to ensure that as many * bits of the time stamp are included as possible. */ for (i = 0; ((DATA_SIZE_BITS + bits - 1) / bits) > i; i++) { shuffle ^= time & mask; time = time >> bits; } /* * We add a lower boundary value to ensure we have a minimum * RNG loop count. */ return (shuffle + (1<<min)); } /* * CPU Jitter noise source -- this is the noise source based on the CPU * execution time jitter * * This function injects the individual bits of the time value into the * entropy pool using a hash. * * ec [in] entropy collector * time [in] time stamp to be injected * stuck [in] Is the time stamp identified as stuck? * * Output: * updated hash context in the entropy collector or error code */ static int jent_condition_data(struct rand_data *ec, __u64 time, int stuck) { #define SHA3_HASH_LOOP (1<<3) struct { int rct_count; unsigned int apt_observations; unsigned int apt_count; unsigned int apt_base; } addtl = { ec->rct_count, ec->apt_observations, ec->apt_count, ec->apt_base }; return jent_hash_time(ec->hash_state, time, (u8 *)&addtl, sizeof(addtl), SHA3_HASH_LOOP, stuck); } /* * Memory Access noise source -- this is a noise source based on variations in * memory access times * * This function performs memory accesses which will add to the timing * variations due to an unknown amount of CPU wait states that need to be * added when accessing memory. The memory size should be larger than the L1 * caches as outlined in the documentation and the associated testing. * * The L1 cache has a very high bandwidth, albeit its access rate is usually * slower than accessing CPU registers. Therefore, L1 accesses only add minimal * variations as the CPU has hardly to wait. Starting with L2, significant * variations are added because L2 typically does not belong to the CPU any more * and therefore a wider range of CPU wait states is necessary for accesses. * L3 and real memory accesses have even a wider range of wait states. However, * to reliably access either L3 or memory, the ec->mem memory must be quite * large which is usually not desirable. 
* * @ec [in] Reference to the entropy collector with the memory access data -- if * the reference to the memory block to be accessed is NULL, this noise * source is disabled * @loop_cnt [in] if a value not equal to 0 is set, use the given value * number of loops to perform the LFSR */ static void jent_memaccess(struct rand_data *ec, __u64 loop_cnt) { unsigned int wrap = 0; __u64 i = 0; #define MAX_ACC_LOOP_BIT 7 #define MIN_ACC_LOOP_BIT 0 __u64 acc_loop_cnt = jent_loop_shuffle(MAX_ACC_LOOP_BIT, MIN_ACC_LOOP_BIT); if (NULL == ec || NULL == ec->mem) return; wrap = ec->memblocksize * ec->memblocks; /* * testing purposes -- allow test app to set the counter, not * needed during runtime */ if (loop_cnt) acc_loop_cnt = loop_cnt; for (i = 0; i < (ec->memaccessloops + acc_loop_cnt); i++) { unsigned char *tmpval = ec->mem + ec->memlocation; /* * memory access: just add 1 to one byte, * wrap at 255 -- memory access implies read * from and write to memory location */ *tmpval = (*tmpval + 1) & 0xff; /* * Addition of memblocksize - 1 to pointer * with wrap around logic to ensure that every * memory location is hit evenly */ ec->memlocation = ec->memlocation + ec->memblocksize - 1; ec->memlocation = ec->memlocation % wrap; } } /*************************************************************************** * Start of entropy processing logic ***************************************************************************/ /* * This is the heart of the entropy generation: calculate time deltas and * use the CPU jitter in the time deltas. The jitter is injected into the * entropy pool. * * WARNING: ensure that ->prev_time is primed before using the output * of this function! This can be done by calling this function * and not using its result. * * @ec [in] Reference to entropy collector * * @return result of stuck test */ static int jent_measure_jitter(struct rand_data *ec, __u64 *ret_current_delta) { __u64 time = 0; __u64 current_delta = 0; int stuck; /* Invoke one noise source before time measurement to add variations */ jent_memaccess(ec, 0); /* * Get time stamp and calculate time delta to previous * invocation to measure the timing variations */ jent_get_nstime(&time); current_delta = jent_delta(ec->prev_time, time); ec->prev_time = time; /* Check whether we have a stuck measurement. */ stuck = jent_stuck(ec, current_delta); /* Now call the next noise sources which also injects the data */ if (jent_condition_data(ec, current_delta, stuck)) stuck = 1; /* return the raw entropy value */ if (ret_current_delta) *ret_current_delta = current_delta; return stuck; } /* * Generator of one 64 bit random number * Function fills rand_data->hash_state * * @ec [in] Reference to entropy collector */ static void jent_gen_entropy(struct rand_data *ec) { unsigned int k = 0, safety_factor = 0; if (fips_enabled) safety_factor = JENT_ENTROPY_SAFETY_FACTOR; /* priming of the ->prev_time value */ jent_measure_jitter(ec, NULL); while (!jent_health_failure(ec)) { /* If a stuck measurement is received, repeat measurement */ if (jent_measure_jitter(ec, NULL)) continue; /* * We multiply the loop value with ->osr to obtain the * oversampling rate requested by the caller */ if (++k >= ((DATA_SIZE_BITS + safety_factor) * ec->osr)) break; } } /* * Entry function: Obtain entropy for the caller. * * This function invokes the entropy gathering logic as often to generate * as many bytes as requested by the caller. The entropy gathering logic * creates 64 bit per invocation. 
* * This function truncates the last 64 bit entropy value output to the exact * size specified by the caller. * * @ec [in] Reference to entropy collector * @data [in] pointer to buffer for storing random data -- buffer must already * exist * @len [in] size of the buffer, specifying also the requested number of random * in bytes * * @return 0 when request is fulfilled or an error * * The following error codes can occur: * -1 entropy_collector is NULL or the generation failed * -2 Intermittent health failure * -3 Permanent health failure */ int jent_read_entropy(struct rand_data *ec, unsigned char *data, unsigned int len) { unsigned char *p = data; if (!ec) return -1; while (len > 0) { unsigned int tocopy, health_test_result; jent_gen_entropy(ec); health_test_result = jent_health_failure(ec); if (health_test_result > JENT_PERMANENT_FAILURE_SHIFT) { /* * At this point, the Jitter RNG instance is considered * as a failed instance. There is no rerun of the * startup test any more, because the caller * is assumed to not further use this instance. */ return -3; } else if (health_test_result) { /* * Perform startup health tests and return permanent * error if it fails. */ if (jent_entropy_init(0, 0, NULL, ec)) { /* Mark the permanent error */ ec->health_failure &= JENT_RCT_FAILURE_PERMANENT | JENT_APT_FAILURE_PERMANENT; return -3; } return -2; } tocopy = min(DATA_SIZE_BITS / 8, len); if (jent_read_random_block(ec->hash_state, p, tocopy)) return -1; len -= tocopy; p += tocopy; } return 0; } /*************************************************************************** * Initialization logic ***************************************************************************/ struct rand_data *jent_entropy_collector_alloc(unsigned int osr, unsigned int flags, void *hash_state) { struct rand_data *entropy_collector; entropy_collector = jent_zalloc(sizeof(struct rand_data)); if (!entropy_collector) return NULL; if (!(flags & JENT_DISABLE_MEMORY_ACCESS)) { /* Allocate memory for adding variations based on memory * access */ entropy_collector->mem = jent_kvzalloc(JENT_MEMORY_SIZE); if (!entropy_collector->mem) { jent_zfree(entropy_collector); return NULL; } entropy_collector->memblocksize = CONFIG_CRYPTO_JITTERENTROPY_MEMORY_BLOCKSIZE; entropy_collector->memblocks = CONFIG_CRYPTO_JITTERENTROPY_MEMORY_BLOCKS; entropy_collector->memaccessloops = JENT_MEMORY_ACCESSLOOPS; } /* verify and set the oversampling rate */ if (osr == 0) osr = 1; /* H_submitter = 1 / osr */ entropy_collector->osr = osr; entropy_collector->flags = flags; entropy_collector->hash_state = hash_state; /* Initialize the APT */ jent_apt_init(entropy_collector, osr); /* fill the data pad with non-zero values */ jent_gen_entropy(entropy_collector); return entropy_collector; } void jent_entropy_collector_free(struct rand_data *entropy_collector) { jent_kvzfree(entropy_collector->mem, JENT_MEMORY_SIZE); entropy_collector->mem = NULL; jent_zfree(entropy_collector); } int jent_entropy_init(unsigned int osr, unsigned int flags, void *hash_state, struct rand_data *p_ec) { /* * If caller provides an allocated ec, reuse it which implies that the * health test entropy data is used to further still the available * entropy pool. 
*/ struct rand_data *ec = p_ec; int i, time_backwards = 0, ret = 0, ec_free = 0; unsigned int health_test_result; if (!ec) { ec = jent_entropy_collector_alloc(osr, flags, hash_state); if (!ec) return JENT_EMEM; ec_free = 1; } else { /* Reset the APT */ jent_apt_reset(ec, 0); /* Ensure that a new APT base is obtained */ ec->apt_base_set = 0; /* Reset the RCT */ ec->rct_count = 0; /* Reset intermittent, leave permanent health test result */ ec->health_failure &= (~JENT_RCT_FAILURE); ec->health_failure &= (~JENT_APT_FAILURE); } /* We could perform statistical tests here, but the problem is * that we only have a few loop counts to do testing. These * loop counts may show some slight skew and we produce * false positives. * * Moreover, only old systems show potentially problematic * jitter entropy that could potentially be caught here. But * the RNG is intended for hardware that is available or widely * used, but not old systems that are long out of favor. Thus, * no statistical tests. */ /* * We could add a check for system capabilities such as clock_getres or * check for CONFIG_X86_TSC, but it does not make much sense as the * following sanity checks verify that we have a high-resolution * timer. */ /* * TESTLOOPCOUNT needs some loops to identify edge systems. 100 is * definitely too little. * * SP800-90B requires at least 1024 initial test cycles. */ #define TESTLOOPCOUNT 1024 #define CLEARCACHE 100 for (i = 0; (TESTLOOPCOUNT + CLEARCACHE) > i; i++) { __u64 start_time = 0, end_time = 0, delta = 0; /* Invoke core entropy collection logic */ jent_measure_jitter(ec, &delta); end_time = ec->prev_time; start_time = ec->prev_time - delta; /* test whether timer works */ if (!start_time || !end_time) { ret = JENT_ENOTIME; goto out; } /* * test whether timer is fine grained enough to provide * delta even when called shortly after each other -- this * implies that we also have a high resolution timer */ if (!delta || (end_time == start_time)) { ret = JENT_ECOARSETIME; goto out; } /* * up to here we did not modify any variable that will be * evaluated later, but we already performed some work. Thus we * already have had an impact on the caches, branch prediction, * etc. with the goal to clear it to get the worst case * measurements. */ if (i < CLEARCACHE) continue; /* test whether we have an increasing timer */ if (!(end_time > start_time)) time_backwards++; } /* * we allow up to three times the time running backwards. * CLOCK_REALTIME is affected by adjtime and NTP operations. Thus, * if such an operation just happens to interfere with our test, it * should not fail. The value of 3 should cover the NTP case being * performed during our test run. */ if (time_backwards > 3) { ret = JENT_ENOMONOTONIC; goto out; } /* Did we encounter a health test failure? */ health_test_result = jent_health_failure(ec); if (health_test_result) { ret = (health_test_result & JENT_RCT_FAILURE) ? JENT_ERCT : JENT_EHEALTH; goto out; } out: if (ec_free) jent_entropy_collector_free(ec); return ret; }
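A small user-space sketch (separate from the kernel code above) of two pieces of arithmetic documented in the comments: the wrap-around time delta computed by jent_delta(), and the RCT cutoffs of 30*osr (intermittent) and 60*osr (permanent). The osr value below is an arbitrary example:

#include <stdint.h>
#include <stdio.h>

/* Same wrap-around handling as jent_delta(): if the counter wrapped
 * between the two samples, count the distance across the wrap.
 */
static uint64_t delta(uint64_t prev, uint64_t next)
{
	return (prev < next) ? (next - prev) : (UINT64_MAX - prev + 1 + next);
}

int main(void)
{
	unsigned int osr = 3;	/* example oversampling rate */

	/* Counter wrapped between the two samples: 5 ticks up to the wrap,
	 * plus 1 for the wrap itself, plus 10 after it = 16.
	 */
	printf("delta across wrap: %llu\n",
	       (unsigned long long)delta(UINT64_MAX - 5, 10));

	/* Cutoffs from C = ceil(-log2(alpha)/H) with H = 1/osr */
	printf("intermittent RCT cutoff (alpha = 2^-30): %u\n", 30 * osr);
	printf("permanent RCT cutoff    (alpha = 2^-60): %u\n", 60 * osr);
	return 0;
}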
// SPDX-License-Identifier: GPL-2.0-or-later /* mpi-sub-ui.c - Subtract an unsigned integer from an MPI. * * Copyright 1991, 1993, 1994, 1996, 1999-2002, 2004, 2012, 2013, 2015 * Free Software Foundation, Inc. * * This file was based on the GNU MP Library source file: * https://gmplib.org/repo/gmp-6.2/file/510b83519d1c/mpz/aors_ui.h * * The GNU MP Library is free software; you can redistribute it and/or modify * it under the terms of either: * * * the GNU Lesser General Public License as published by the Free * Software Foundation; either version 3 of the License, or (at your * option) any later version. * * or * * * the GNU General Public License as published by the Free Software * Foundation; either version 2 of the License, or (at your option) any * later version. * * or both in parallel, as here. * * The GNU MP Library is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License * for more details. * * You should have received copies of the GNU General Public License and the * GNU Lesser General Public License along with the GNU MP Library. If not, * see https://www.gnu.org/licenses/. */ #include <linux/export.h> #include "mpi-internal.h" int mpi_sub_ui(MPI w, MPI u, unsigned long vval) { if (u->nlimbs == 0) { if (mpi_resize(w, 1) < 0) return -ENOMEM; w->d[0] = vval; w->nlimbs = (vval != 0); w->sign = (vval != 0); return 0; } /* If not space for W (and possible carry), increase space. */ if (mpi_resize(w, u->nlimbs + 1)) return -ENOMEM; if (u->sign) { mpi_limb_t cy; cy = mpihelp_add_1(w->d, u->d, u->nlimbs, (mpi_limb_t) vval); w->d[u->nlimbs] = cy; w->nlimbs = u->nlimbs + cy; w->sign = 1; } else { /* The signs are different. Need exact comparison to determine * which operand to subtract from which. */ if (u->nlimbs == 1 && u->d[0] < vval) { w->d[0] = vval - u->d[0]; w->nlimbs = 1; w->sign = 1; } else { mpihelp_sub_1(w->d, u->d, u->nlimbs, (mpi_limb_t) vval); /* Size can decrease with at most one limb. */ w->nlimbs = (u->nlimbs - (w->d[u->nlimbs - 1] == 0)); w->sign = 0; } } mpi_normalize(w); return 0; } EXPORT_SYMBOL_GPL(mpi_sub_ui);
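The sign handling in mpi_sub_ui() can be restated with a toy signed-magnitude integer. This is an illustration only: it ignores multi-limb carries, and none of the toy_* names belong to the MPI library:

#include <stdio.h>

struct toy_int {
	unsigned long mag;	/* magnitude, stands in for the limb array */
	int neg;		/* sign flag, mirrors the MPI "sign" field */
};

static struct toy_int toy_sub_ui(struct toy_int u, unsigned long v)
{
	struct toy_int w;

	if (u.mag == 0) {		/* 0 - v = -v */
		w.mag = v;
		w.neg = (v != 0);
	} else if (u.neg) {		/* (-|u|) - v = -(|u| + v) */
		w.mag = u.mag + v;
		w.neg = 1;
	} else if (u.mag < v) {		/* |u| < v: the result flips sign */
		w.mag = v - u.mag;
		w.neg = 1;
	} else {			/* ordinary magnitude subtraction */
		w.mag = u.mag - v;
		w.neg = 0;
	}
	return w;
}

int main(void)
{
	struct toy_int u = { .mag = 3, .neg = 0 };
	struct toy_int w = toy_sub_ui(u, 7);

	printf("3 - 7 = %s%lu\n", w.neg ? "-" : "", w.mag);	/* prints -4 */
	return 0;
}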
/* * Constant-time equality testing of memory regions. * * Authors: * * James Yonan <james@openvpn.net> * Daniel Borkmann <dborkman@redhat.com> * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. * * GPL LICENSE SUMMARY * * Copyright(c) 2013 OpenVPN Technologies, Inc. All rights reserved. * * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as * published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. * The full GNU General Public License is included in this distribution * in the file called LICENSE.GPL. * * BSD LICENSE * * Copyright(c) 2013 OpenVPN Technologies, Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * * Neither the name of OpenVPN Technologies nor the names of its * contributors may be used to endorse or promote products derived * from this software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/ #include <crypto/algapi.h> #include <linux/export.h> #include <linux/module.h> #include <linux/unaligned.h> /* Generic path for arbitrary size */ static inline unsigned long __crypto_memneq_generic(const void *a, const void *b, size_t size) { unsigned long neq = 0; #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) while (size >= sizeof(unsigned long)) { neq |= get_unaligned((unsigned long *)a) ^ get_unaligned((unsigned long *)b); OPTIMIZER_HIDE_VAR(neq); a += sizeof(unsigned long); b += sizeof(unsigned long); size -= sizeof(unsigned long); } #endif /* CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS */ while (size > 0) { neq |= *(unsigned char *)a ^ *(unsigned char *)b; OPTIMIZER_HIDE_VAR(neq); a += 1; b += 1; size -= 1; } return neq; } /* Loop-free fast-path for frequently used 16-byte size */ static inline unsigned long __crypto_memneq_16(const void *a, const void *b) { unsigned long neq = 0; #ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS if (sizeof(unsigned long) == 8) { neq |= get_unaligned((unsigned long *)a) ^ get_unaligned((unsigned long *)b); OPTIMIZER_HIDE_VAR(neq); neq |= get_unaligned((unsigned long *)(a + 8)) ^ get_unaligned((unsigned long *)(b + 8)); OPTIMIZER_HIDE_VAR(neq); } else if (sizeof(unsigned int) == 4) { neq |= get_unaligned((unsigned int *)a) ^ get_unaligned((unsigned int *)b); OPTIMIZER_HIDE_VAR(neq); neq |= get_unaligned((unsigned int *)(a + 4)) ^ get_unaligned((unsigned int *)(b + 4)); OPTIMIZER_HIDE_VAR(neq); neq |= get_unaligned((unsigned int *)(a + 8)) ^ get_unaligned((unsigned int *)(b + 8)); OPTIMIZER_HIDE_VAR(neq); neq |= get_unaligned((unsigned int *)(a + 12)) ^ get_unaligned((unsigned int *)(b + 12)); OPTIMIZER_HIDE_VAR(neq); } else #endif /* CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS */ { neq |= *(unsigned char *)(a) ^ *(unsigned char *)(b); OPTIMIZER_HIDE_VAR(neq); neq |= *(unsigned char *)(a+1) ^ *(unsigned char *)(b+1); OPTIMIZER_HIDE_VAR(neq); neq |= *(unsigned char *)(a+2) ^ *(unsigned char *)(b+2); OPTIMIZER_HIDE_VAR(neq); neq |= *(unsigned char *)(a+3) ^ *(unsigned char *)(b+3); OPTIMIZER_HIDE_VAR(neq); neq |= *(unsigned char *)(a+4) ^ *(unsigned char *)(b+4); OPTIMIZER_HIDE_VAR(neq); neq |= *(unsigned char *)(a+5) ^ *(unsigned char *)(b+5); OPTIMIZER_HIDE_VAR(neq); neq |= *(unsigned char *)(a+6) ^ *(unsigned char *)(b+6); OPTIMIZER_HIDE_VAR(neq); neq |= *(unsigned char *)(a+7) ^ *(unsigned char *)(b+7); OPTIMIZER_HIDE_VAR(neq); neq |= *(unsigned char *)(a+8) ^ *(unsigned char *)(b+8); OPTIMIZER_HIDE_VAR(neq); neq |= *(unsigned char *)(a+9) ^ *(unsigned char *)(b+9); OPTIMIZER_HIDE_VAR(neq); neq |= *(unsigned char *)(a+10) ^ *(unsigned char *)(b+10); OPTIMIZER_HIDE_VAR(neq); neq |= *(unsigned char *)(a+11) ^ *(unsigned char *)(b+11); OPTIMIZER_HIDE_VAR(neq); neq |= *(unsigned char *)(a+12) ^ *(unsigned char *)(b+12); OPTIMIZER_HIDE_VAR(neq); neq |= *(unsigned char *)(a+13) ^ *(unsigned char *)(b+13); OPTIMIZER_HIDE_VAR(neq); neq |= *(unsigned char *)(a+14) ^ *(unsigned char *)(b+14); OPTIMIZER_HIDE_VAR(neq); neq |= *(unsigned char *)(a+15) ^ *(unsigned char *)(b+15); OPTIMIZER_HIDE_VAR(neq); } return neq; } /* Compare two areas of memory without leaking timing information, * and with special optimizations for common sizes. Users should * not call this function directly, but should instead use * crypto_memneq defined in crypto/algapi.h. 
*/ noinline unsigned long __crypto_memneq(const void *a, const void *b, size_t size) { switch (size) { case 16: return __crypto_memneq_16(a, b); default: return __crypto_memneq_generic(a, b, size); } } EXPORT_SYMBOL(__crypto_memneq);
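A user-space sketch of the accumulate-XOR idea behind __crypto_memneq(): differences are OR-ed together instead of returning at the first mismatch, so the running time does not depend on where (or whether) the buffers differ. The volatile qualifier merely stands in for OPTIMIZER_HIDE_VAR() in this sketch and is not the kernel mechanism:

#include <stddef.h>
#include <stdio.h>
#include <string.h>

static unsigned long ct_memneq(const void *a, const void *b, size_t size)
{
	const unsigned char *pa = a, *pb = b;
	volatile unsigned long neq = 0;	/* discourage short-circuit optimizations */
	size_t i;

	/* Visit every byte regardless of earlier mismatches. */
	for (i = 0; i < size; i++)
		neq |= pa[i] ^ pb[i];

	return neq;	/* non-zero means "not equal", like __crypto_memneq() */
}

int main(void)
{
	unsigned char tag1[16], tag2[16];

	memset(tag1, 0xaa, sizeof(tag1));
	memset(tag2, 0xaa, sizeof(tag2));
	tag2[15] ^= 1;	/* flip one bit in the last byte */

	printf("identical buffers differ: %d\n", ct_memneq(tag1, tag1, 16) != 0);
	printf("tampered buffer differs:  %d\n", ct_memneq(tag1, tag2, 16) != 0);
	return 0;
}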
// SPDX-License-Identifier: GPL-2.0-or-later /* SCTP kernel implementation * Copyright (c) 1999-2000 Cisco, Inc. * Copyright (c) 1999-2001 Motorola, Inc. * Copyright (c) 2001-2003 International Business Machines, Corp. * Copyright (c) 2001 Intel Corp. * Copyright (c) 2001 Nokia, Inc. * Copyright (c) 2001 La Monte H.P. Yarroll * * This file is part of the SCTP kernel implementation * * These functions handle all input from the IP layer into SCTP. * * Please send any bug reports or fixes you make to the * email address(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * La Monte H.P.
Yarroll <piggy@acm.org> * Karl Knutson <karl@athena.chicago.il.us> * Xingang Guo <xingang.guo@intel.com> * Jon Grimm <jgrimm@us.ibm.com> * Hui Huang <hui.huang@nokia.com> * Daisy Chang <daisyc@us.ibm.com> * Sridhar Samudrala <sri@us.ibm.com> * Ardelle Fan <ardelle.fan@intel.com> */ #include <linux/types.h> #include <linux/list.h> /* For struct list_head */ #include <linux/socket.h> #include <linux/ip.h> #include <linux/time.h> /* For struct timeval */ #include <linux/slab.h> #include <net/ip.h> #include <net/icmp.h> #include <net/snmp.h> #include <net/sock.h> #include <net/xfrm.h> #include <net/sctp/sctp.h> #include <net/sctp/sm.h> #include <net/sctp/checksum.h> #include <net/net_namespace.h> #include <linux/rhashtable.h> #include <net/sock_reuseport.h> /* Forward declarations for internal helpers. */ static int sctp_rcv_ootb(struct sk_buff *); static struct sctp_association *__sctp_rcv_lookup(struct net *net, struct sk_buff *skb, const union sctp_addr *paddr, const union sctp_addr *laddr, struct sctp_transport **transportp, int dif, int sdif); static struct sctp_endpoint *__sctp_rcv_lookup_endpoint( struct net *net, struct sk_buff *skb, const union sctp_addr *laddr, const union sctp_addr *daddr, int dif, int sdif); static struct sctp_association *__sctp_lookup_association( struct net *net, const union sctp_addr *local, const union sctp_addr *peer, struct sctp_transport **pt, int dif, int sdif); static int sctp_add_backlog(struct sock *sk, struct sk_buff *skb); /* Calculate the SCTP checksum of an SCTP packet. */ static inline int sctp_rcv_checksum(struct net *net, struct sk_buff *skb) { struct sctphdr *sh = sctp_hdr(skb); __le32 cmp = sh->checksum; __le32 val = sctp_compute_cksum(skb, 0); if (val != cmp) { /* CRC failure, dump it. */ __SCTP_INC_STATS(net, SCTP_MIB_CHECKSUMERRORS); return -1; } return 0; } /* * This is the routine which IP calls when receiving an SCTP packet. */ int sctp_rcv(struct sk_buff *skb) { struct sock *sk; struct sctp_association *asoc; struct sctp_endpoint *ep = NULL; struct sctp_ep_common *rcvr; struct sctp_transport *transport = NULL; struct sctp_chunk *chunk; union sctp_addr src; union sctp_addr dest; int family; struct sctp_af *af; struct net *net = dev_net(skb->dev); bool is_gso = skb_is_gso(skb) && skb_is_gso_sctp(skb); int dif, sdif; if (skb->pkt_type != PACKET_HOST) goto discard_it; __SCTP_INC_STATS(net, SCTP_MIB_INSCTPPACKS); /* If packet is too small to contain a single chunk, let's not * waste time on it anymore. */ if (skb->len < sizeof(struct sctphdr) + sizeof(struct sctp_chunkhdr) + skb_transport_offset(skb)) goto discard_it; /* If the packet is fragmented and we need to do crc checking, * it's better to just linearize it otherwise crc computing * takes longer. */ if (((!is_gso || skb_cloned(skb)) && skb_linearize(skb)) || !pskb_may_pull(skb, sizeof(struct sctphdr))) goto discard_it; /* Pull up the IP header. */ __skb_pull(skb, skb_transport_offset(skb)); skb->csum_valid = 0; /* Previous value not applicable */ if (skb_csum_unnecessary(skb)) __skb_decr_checksum_unnecessary(skb); else if (!sctp_checksum_disable && !is_gso && sctp_rcv_checksum(net, skb) < 0) goto discard_it; skb->csum_valid = 1; __skb_pull(skb, sizeof(struct sctphdr)); family = ipver2af(ip_hdr(skb)->version); af = sctp_get_af_specific(family); if (unlikely(!af)) goto discard_it; SCTP_INPUT_CB(skb)->af = af; /* Initialize local addresses for lookups. 
*/ af->from_skb(&src, skb, 1); af->from_skb(&dest, skb, 0); dif = af->skb_iif(skb); sdif = af->skb_sdif(skb); /* If the packet is to or from a non-unicast address, * silently discard the packet. * * This is not clearly defined in the RFC except in section * 8.4 - OOTB handling. However, based on the book "Stream Control * Transmission Protocol" 2.1, "It is important to note that the * IP address of an SCTP transport address must be a routable * unicast address. In other words, IP multicast addresses and * IP broadcast addresses cannot be used in an SCTP transport * address." */ if (!af->addr_valid(&src, NULL, skb) || !af->addr_valid(&dest, NULL, skb)) goto discard_it; asoc = __sctp_rcv_lookup(net, skb, &src, &dest, &transport, dif, sdif); if (!asoc) ep = __sctp_rcv_lookup_endpoint(net, skb, &dest, &src, dif, sdif); /* Retrieve the common input handling substructure. */ rcvr = asoc ? &asoc->base : &ep->base; sk = rcvr->sk; /* * RFC 2960, 8.4 - Handle "Out of the blue" Packets. * An SCTP packet is called an "out of the blue" (OOTB) * packet if it is correctly formed, i.e., passed the * receiver's checksum check, but the receiver is not * able to identify the association to which this * packet belongs. */ if (!asoc) { if (sctp_rcv_ootb(skb)) { __SCTP_INC_STATS(net, SCTP_MIB_OUTOFBLUES); goto discard_release; } } if (!xfrm_policy_check(sk, XFRM_POLICY_IN, skb, family)) goto discard_release; nf_reset_ct(skb); if (sk_filter(sk, skb)) goto discard_release; /* Create an SCTP packet structure. */ chunk = sctp_chunkify(skb, asoc, sk, GFP_ATOMIC); if (!chunk) goto discard_release; SCTP_INPUT_CB(skb)->chunk = chunk; /* Remember what endpoint is to handle this packet. */ chunk->rcvr = rcvr; /* Remember the SCTP header. */ chunk->sctp_hdr = sctp_hdr(skb); /* Set the source and destination addresses of the incoming chunk. */ sctp_init_addrs(chunk, &src, &dest); /* Remember where we came from. */ chunk->transport = transport; /* Acquire access to the sock lock. Note: We are safe from other * bottom halves on this lock, but a user may be in the lock too, * so check if it is busy. */ bh_lock_sock(sk); if (sk != rcvr->sk) { /* Our cached sk is different from the rcvr->sk. This is * because migrate()/accept() may have moved the association * to a new socket and released all the sockets. So now we * are holding a lock on the old socket while the user may * be doing something with the new socket. Switch our view * of the current sk. */ bh_unlock_sock(sk); sk = rcvr->sk; bh_lock_sock(sk); } if (sock_owned_by_user(sk) || !sctp_newsk_ready(sk)) { if (sctp_add_backlog(sk, skb)) { bh_unlock_sock(sk); sctp_chunk_free(chunk); skb = NULL; /* sctp_chunk_free already freed the skb */ goto discard_release; } __SCTP_INC_STATS(net, SCTP_MIB_IN_PKT_BACKLOG); } else { __SCTP_INC_STATS(net, SCTP_MIB_IN_PKT_SOFTIRQ); sctp_inq_push(&chunk->rcvr->inqueue, chunk); } bh_unlock_sock(sk); /* Release the asoc/ep ref we took in the lookup calls. */ if (transport) sctp_transport_put(transport); else sctp_endpoint_put(ep); return 0; discard_it: __SCTP_INC_STATS(net, SCTP_MIB_IN_PKT_DISCARDS); kfree_skb(skb); return 0; discard_release: /* Release the asoc/ep ref we took in the lookup calls. */ if (transport) sctp_transport_put(transport); else sctp_endpoint_put(ep); goto discard_it; } /* Process the backlog queue of the socket. Every skb on * the backlog holds a ref on an association or endpoint. * We hold this ref throughout the state machine to make * sure that the structure we need is still around.
*/ int sctp_backlog_rcv(struct sock *sk, struct sk_buff *skb) { struct sctp_chunk *chunk = SCTP_INPUT_CB(skb)->chunk; struct sctp_inq *inqueue = &chunk->rcvr->inqueue; struct sctp_transport *t = chunk->transport; struct sctp_ep_common *rcvr = NULL; int backloged = 0; rcvr = chunk->rcvr; /* If the rcvr is dead then the association or endpoint * has been deleted and we can safely drop the chunk * and refs that we are holding. */ if (rcvr->dead) { sctp_chunk_free(chunk); goto done; } if (unlikely(rcvr->sk != sk)) { /* In this case, the association moved from one socket to * another. We are currently sitting on the backlog of the * old socket, so we need to move. * However, since we are here in the process context we * need to make sure that the user doesn't own * the new socket when we process the packet. * If the new socket is user-owned, queue the chunk to the * backlog of the new socket without dropping any refs. * Otherwise, we can safely push the chunk on the inqueue. */ sk = rcvr->sk; local_bh_disable(); bh_lock_sock(sk); if (sock_owned_by_user(sk) || !sctp_newsk_ready(sk)) { if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) sctp_chunk_free(chunk); else backloged = 1; } else sctp_inq_push(inqueue, chunk); bh_unlock_sock(sk); local_bh_enable(); /* If the chunk was backloged again, don't drop refs */ if (backloged) return 0; } else { if (!sctp_newsk_ready(sk)) { if (!sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) return 0; sctp_chunk_free(chunk); } else { sctp_inq_push(inqueue, chunk); } } done: /* Release the refs we took in sctp_add_backlog */ if (SCTP_EP_TYPE_ASSOCIATION == rcvr->type) sctp_transport_put(t); else if (SCTP_EP_TYPE_SOCKET == rcvr->type) sctp_endpoint_put(sctp_ep(rcvr)); else BUG(); return 0; } static int sctp_add_backlog(struct sock *sk, struct sk_buff *skb) { struct sctp_chunk *chunk = SCTP_INPUT_CB(skb)->chunk; struct sctp_transport *t = chunk->transport; struct sctp_ep_common *rcvr = chunk->rcvr; int ret; ret = sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf)); if (!ret) { /* Hold the assoc/ep while hanging on the backlog queue. * This way, we know structures we need will not disappear * from us */ if (SCTP_EP_TYPE_ASSOCIATION == rcvr->type) sctp_transport_hold(t); else if (SCTP_EP_TYPE_SOCKET == rcvr->type) sctp_endpoint_hold(sctp_ep(rcvr)); else BUG(); } return ret; } /* Handle icmp frag needed error. */ void sctp_icmp_frag_needed(struct sock *sk, struct sctp_association *asoc, struct sctp_transport *t, __u32 pmtu) { if (!t || (t->pathmtu <= pmtu && t->pl.probe_size + sctp_transport_pl_hlen(t) <= pmtu)) return; if (sock_owned_by_user(sk)) { atomic_set(&t->mtu_info, pmtu); asoc->pmtu_pending = 1; t->pmtu_pending = 1; return; } if (!(t->param_flags & SPP_PMTUD_ENABLE)) /* We can't allow retransmitting in such case, as the * retransmission would be sized just as before, and thus we * would get another icmp, and retransmit again. */ return; /* Update the transport's view of the MTU. Return if no update was needed. * If an update wasn't needed/possible, it also doesn't make sense to * try to retransmit now. */ if (!sctp_transport_update_pmtu(t, pmtu)) return; /* Update association pmtu. */ sctp_assoc_sync_pmtu(asoc); /* Retransmit with the new pmtu setting.
*/ sctp_retransmit(&asoc->outqueue, t, SCTP_RTXR_PMTUD); } void sctp_icmp_redirect(struct sock *sk, struct sctp_transport *t, struct sk_buff *skb) { struct dst_entry *dst; if (sock_owned_by_user(sk) || !t) return; dst = sctp_transport_dst_check(t); if (dst) dst->ops->redirect(dst, sk, skb); } /* * SCTP Implementer's Guide, 2.37 ICMP handling procedures * * ICMP8) If the ICMP code is a "Unrecognized next header type encountered" * or a "Protocol Unreachable" treat this message as an abort * with the T bit set. * * This function sends an event to the state machine, which will abort the * association. * */ void sctp_icmp_proto_unreachable(struct sock *sk, struct sctp_association *asoc, struct sctp_transport *t) { if (sock_owned_by_user(sk)) { if (timer_pending(&t->proto_unreach_timer)) return; else { if (!mod_timer(&t->proto_unreach_timer, jiffies + (HZ/20))) sctp_transport_hold(t); } } else { struct net *net = sock_net(sk); pr_debug("%s: unrecognized next header type " "encountered!\n", __func__); if (timer_delete(&t->proto_unreach_timer)) sctp_transport_put(t); sctp_do_sm(net, SCTP_EVENT_T_OTHER, SCTP_ST_OTHER(SCTP_EVENT_ICMP_PROTO_UNREACH), asoc->state, asoc->ep, asoc, t, GFP_ATOMIC); } } /* Common lookup code for icmp/icmpv6 error handler. */ struct sock *sctp_err_lookup(struct net *net, int family, struct sk_buff *skb, struct sctphdr *sctphdr, struct sctp_association **app, struct sctp_transport **tpp) { struct sctp_init_chunk *chunkhdr, _chunkhdr; union sctp_addr saddr; union sctp_addr daddr; struct sctp_af *af; struct sock *sk = NULL; struct sctp_association *asoc; struct sctp_transport *transport = NULL; __u32 vtag = ntohl(sctphdr->vtag); int sdif = inet_sdif(skb); int dif = inet_iif(skb); *app = NULL; *tpp = NULL; af = sctp_get_af_specific(family); if (unlikely(!af)) { return NULL; } /* Initialize local addresses for lookups. */ af->from_skb(&saddr, skb, 1); af->from_skb(&daddr, skb, 0); /* Look for an association that matches the incoming ICMP error * packet. */ asoc = __sctp_lookup_association(net, &saddr, &daddr, &transport, dif, sdif); if (!asoc) return NULL; sk = asoc->base.sk; /* RFC 4960, Appendix C. ICMP Handling * * ICMP6) An implementation MUST validate that the Verification Tag * contained in the ICMP message matches the Verification Tag of * the peer. If the Verification Tag is not 0 and does NOT * match, discard the ICMP message. If it is 0 and the ICMP * message contains enough bytes to verify that the chunk type is * an INIT chunk and that the Initiate Tag matches the tag of the * peer, continue with ICMP7. If the ICMP message is too short * or the chunk type or the Initiate Tag does not match, silently * discard the packet. */ if (vtag == 0) { /* chunk header + first 4 octets of init header */ chunkhdr = skb_header_pointer(skb, skb_transport_offset(skb) + sizeof(struct sctphdr), sizeof(struct sctp_chunkhdr) + sizeof(__be32), &_chunkhdr); if (!chunkhdr || chunkhdr->chunk_hdr.type != SCTP_CID_INIT || ntohl(chunkhdr->init_hdr.init_tag) != asoc->c.my_vtag) goto out; } else if (vtag != asoc->c.peer_vtag) { goto out; } bh_lock_sock(sk); /* If too many ICMPs get dropped on busy * servers this needs to be solved differently. */ if (sock_owned_by_user(sk)) __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS); *app = asoc; *tpp = transport; return sk; out: sctp_transport_put(transport); return NULL; } /* Common cleanup code for icmp/icmpv6 error handler.
*/ void sctp_err_finish(struct sock *sk, struct sctp_transport *t) __releases(&((__sk)->sk_lock.slock)) { bh_unlock_sock(sk); sctp_transport_put(t); } static void sctp_v4_err_handle(struct sctp_transport *t, struct sk_buff *skb, __u8 type, __u8 code, __u32 info) { struct sctp_association *asoc = t->asoc; struct sock *sk = asoc->base.sk; int err = 0; switch (type) { case ICMP_PARAMETERPROB: err = EPROTO; break; case ICMP_DEST_UNREACH: if (code > NR_ICMP_UNREACH) return; if (code == ICMP_FRAG_NEEDED) { sctp_icmp_frag_needed(sk, asoc, t, SCTP_TRUNC4(info)); return; } if (code == ICMP_PROT_UNREACH) { sctp_icmp_proto_unreachable(sk, asoc, t); return; } err = icmp_err_convert[code].errno; break; case ICMP_TIME_EXCEEDED: if (code == ICMP_EXC_FRAGTIME) return; err = EHOSTUNREACH; break; case ICMP_REDIRECT: sctp_icmp_redirect(sk, t, skb); return; default: return; } if (!sock_owned_by_user(sk) && inet_test_bit(RECVERR, sk)) { sk->sk_err = err; sk_error_report(sk); } else { /* Only an error on timeout */ WRITE_ONCE(sk->sk_err_soft, err); } } /* * This routine is called by the ICMP module when it gets some * sort of error condition. If err < 0 then the socket should * be closed and the error returned to the user. If err > 0 * it's just the icmp type << 8 | icmp code. After adjustment * header points to the first 8 bytes of the sctp header. We need * to find the appropriate port. * * The locking strategy used here is very "optimistic". When * someone else accesses the socket the ICMP is just dropped * and for some paths there is no check at all. * A more general error queue to queue errors for later handling * is probably better. * */ int sctp_v4_err(struct sk_buff *skb, __u32 info) { const struct iphdr *iph = (const struct iphdr *)skb->data; const int type = icmp_hdr(skb)->type; const int code = icmp_hdr(skb)->code; struct net *net = dev_net(skb->dev); struct sctp_transport *transport; struct sctp_association *asoc; __u16 saveip, savesctp; struct sock *sk; /* Fix up skb to look at the embedded net header. */ saveip = skb->network_header; savesctp = skb->transport_header; skb_reset_network_header(skb); skb_set_transport_header(skb, iph->ihl * 4); sk = sctp_err_lookup(net, AF_INET, skb, sctp_hdr(skb), &asoc, &transport); /* Put back, the original values. */ skb->network_header = saveip; skb->transport_header = savesctp; if (!sk) { __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); return -ENOENT; } sctp_v4_err_handle(transport, skb, type, code, info); sctp_err_finish(sk, transport); return 0; } int sctp_udp_v4_err(struct sock *sk, struct sk_buff *skb) { struct net *net = dev_net(skb->dev); struct sctp_association *asoc; struct sctp_transport *t; struct icmphdr *hdr; __u32 info = 0; skb->transport_header += sizeof(struct udphdr); sk = sctp_err_lookup(net, AF_INET, skb, sctp_hdr(skb), &asoc, &t); if (!sk) { __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); return -ENOENT; } skb->transport_header -= sizeof(struct udphdr); hdr = (struct icmphdr *)(skb_network_header(skb) - sizeof(struct icmphdr)); if (hdr->type == ICMP_REDIRECT) { /* can't be handled without outer iphdr known, leave it to udp_err */ sctp_err_finish(sk, t); return 0; } if (hdr->type == ICMP_DEST_UNREACH && hdr->code == ICMP_FRAG_NEEDED) info = ntohs(hdr->un.frag.mtu); sctp_v4_err_handle(t, skb, hdr->type, hdr->code, info); sctp_err_finish(sk, t); return 1; } /* * RFC 2960, 8.4 - Handle "Out of the blue" Packets. * * This function scans all the chunks in the OOTB packet to determine if * the packet should be discarded right away. 
If a response might be needed * for this packet, or, if further processing is possible, the packet will * be queued to a proper inqueue for the next phase of handling. * * Output: * Return 0 - If further processing is needed. * Return 1 - If the packet can be discarded right away. */ static int sctp_rcv_ootb(struct sk_buff *skb) { struct sctp_chunkhdr *ch, _ch; int ch_end, offset = 0; /* Scan through all the chunks in the packet. */ do { /* Make sure we have at least the header there */ if (offset + sizeof(_ch) > skb->len) break; ch = skb_header_pointer(skb, offset, sizeof(*ch), &_ch); /* Break out if chunk length is less then minimal. */ if (!ch || ntohs(ch->length) < sizeof(_ch)) break; ch_end = offset + SCTP_PAD4(ntohs(ch->length)); if (ch_end > skb->len) break; /* RFC 8.4, 2) If the OOTB packet contains an ABORT chunk, the * receiver MUST silently discard the OOTB packet and take no * further action. */ if (SCTP_CID_ABORT == ch->type) goto discard; /* RFC 8.4, 6) If the packet contains a SHUTDOWN COMPLETE * chunk, the receiver should silently discard the packet * and take no further action. */ if (SCTP_CID_SHUTDOWN_COMPLETE == ch->type) goto discard; /* RFC 4460, 2.11.2 * This will discard packets with INIT chunk bundled as * subsequent chunks in the packet. When INIT is first, * the normal INIT processing will discard the chunk. */ if (SCTP_CID_INIT == ch->type && (void *)ch != skb->data) goto discard; offset = ch_end; } while (ch_end < skb->len); return 0; discard: return 1; } /* Insert endpoint into the hash table. */ static int __sctp_hash_endpoint(struct sctp_endpoint *ep) { struct sock *sk = ep->base.sk; struct net *net = sock_net(sk); struct sctp_hashbucket *head; int err = 0; ep->hashent = sctp_ep_hashfn(net, ep->base.bind_addr.port); head = &sctp_ep_hashtable[ep->hashent]; write_lock(&head->lock); if (sk->sk_reuseport) { bool any = sctp_is_ep_boundall(sk); struct sctp_endpoint *ep2; struct list_head *list; int cnt = 0; err = 1; list_for_each(list, &ep->base.bind_addr.address_list) cnt++; sctp_for_each_hentry(ep2, &head->chain) { struct sock *sk2 = ep2->base.sk; if (!net_eq(sock_net(sk2), net) || sk2 == sk || !uid_eq(sk_uid(sk2), sk_uid(sk)) || !sk2->sk_reuseport) continue; err = sctp_bind_addrs_check(sctp_sk(sk2), sctp_sk(sk), cnt); if (!err) { err = reuseport_add_sock(sk, sk2, any); if (err) goto out; break; } else if (err < 0) { goto out; } } if (err) { err = reuseport_alloc(sk, any); if (err) goto out; } } hlist_add_head(&ep->node, &head->chain); out: write_unlock(&head->lock); return err; } /* Add an endpoint to the hash. Local BH-safe. */ int sctp_hash_endpoint(struct sctp_endpoint *ep) { int err; local_bh_disable(); err = __sctp_hash_endpoint(ep); local_bh_enable(); return err; } /* Remove endpoint from the hash table. */ static void __sctp_unhash_endpoint(struct sctp_endpoint *ep) { struct sock *sk = ep->base.sk; struct sctp_hashbucket *head; ep->hashent = sctp_ep_hashfn(sock_net(sk), ep->base.bind_addr.port); head = &sctp_ep_hashtable[ep->hashent]; write_lock(&head->lock); if (rcu_access_pointer(sk->sk_reuseport_cb)) reuseport_detach_sock(sk); hlist_del_init(&ep->node); write_unlock(&head->lock); } /* Remove endpoint from the hash. Local BH-safe. 
*/ void sctp_unhash_endpoint(struct sctp_endpoint *ep) { local_bh_disable(); __sctp_unhash_endpoint(ep); local_bh_enable(); } static inline __u32 sctp_hashfn(const struct net *net, __be16 lport, const union sctp_addr *paddr, __u32 seed) { __u32 addr; if (paddr->sa.sa_family == AF_INET6) addr = jhash(&paddr->v6.sin6_addr, 16, seed); else addr = (__force __u32)paddr->v4.sin_addr.s_addr; return jhash_3words(addr, ((__force __u32)paddr->v4.sin_port) << 16 | (__force __u32)lport, net_hash_mix(net), seed); } /* Look up an endpoint. */ static struct sctp_endpoint *__sctp_rcv_lookup_endpoint( struct net *net, struct sk_buff *skb, const union sctp_addr *laddr, const union sctp_addr *paddr, int dif, int sdif) { struct sctp_hashbucket *head; struct sctp_endpoint *ep; struct sock *sk; __be16 lport; int hash; lport = laddr->v4.sin_port; hash = sctp_ep_hashfn(net, ntohs(lport)); head = &sctp_ep_hashtable[hash]; read_lock(&head->lock); sctp_for_each_hentry(ep, &head->chain) { if (sctp_endpoint_is_match(ep, net, laddr, dif, sdif)) goto hit; } ep = sctp_sk(net->sctp.ctl_sock)->ep; hit: sk = ep->base.sk; if (sk->sk_reuseport) { __u32 phash = sctp_hashfn(net, lport, paddr, 0); sk = reuseport_select_sock(sk, phash, skb, sizeof(struct sctphdr)); if (sk) ep = sctp_sk(sk)->ep; } sctp_endpoint_hold(ep); read_unlock(&head->lock); return ep; } /* rhashtable for transport */ struct sctp_hash_cmp_arg { const union sctp_addr *paddr; const struct net *net; __be16 lport; }; static inline int sctp_hash_cmp(struct rhashtable_compare_arg *arg, const void *ptr) { struct sctp_transport *t = (struct sctp_transport *)ptr; const struct sctp_hash_cmp_arg *x = arg->key; int err = 1; if (!sctp_cmp_addr_exact(&t->ipaddr, x->paddr)) return err; if (!sctp_transport_hold(t)) return err; if (!net_eq(t->asoc->base.net, x->net)) goto out; if (x->lport != htons(t->asoc->base.bind_addr.port)) goto out; err = 0; out: sctp_transport_put(t); return err; } static inline __u32 sctp_hash_obj(const void *data, u32 len, u32 seed) { const struct sctp_transport *t = data; return sctp_hashfn(t->asoc->base.net, htons(t->asoc->base.bind_addr.port), &t->ipaddr, seed); } static inline __u32 sctp_hash_key(const void *data, u32 len, u32 seed) { const struct sctp_hash_cmp_arg *x = data; return sctp_hashfn(x->net, x->lport, x->paddr, seed); } static const struct rhashtable_params sctp_hash_params = { .head_offset = offsetof(struct sctp_transport, node), .hashfn = sctp_hash_key, .obj_hashfn = sctp_hash_obj, .obj_cmpfn = sctp_hash_cmp, .automatic_shrinking = true, }; int sctp_transport_hashtable_init(void) { return rhltable_init(&sctp_transport_hashtable, &sctp_hash_params); } void sctp_transport_hashtable_destroy(void) { rhltable_destroy(&sctp_transport_hashtable); } int sctp_hash_transport(struct sctp_transport *t) { struct sctp_transport *transport; struct rhlist_head *tmp, *list; struct sctp_hash_cmp_arg arg; int err; if (t->asoc->temp) return 0; arg.net = t->asoc->base.net; arg.paddr = &t->ipaddr; arg.lport = htons(t->asoc->base.bind_addr.port); rcu_read_lock(); list = rhltable_lookup(&sctp_transport_hashtable, &arg, sctp_hash_params); rhl_for_each_entry_rcu(transport, tmp, list, node) if (transport->asoc->ep == t->asoc->ep) { rcu_read_unlock(); return -EEXIST; } rcu_read_unlock(); err = rhltable_insert_key(&sctp_transport_hashtable, &arg, &t->node, sctp_hash_params); if (err) pr_err_once("insert transport fail, errno %d\n", err); return err; } void sctp_unhash_transport(struct sctp_transport *t) { if (t->asoc->temp) return; 
rhltable_remove(&sctp_transport_hashtable, &t->node, sctp_hash_params); } bool sctp_sk_bound_dev_eq(struct net *net, int bound_dev_if, int dif, int sdif) { bool l3mdev_accept = true; #if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) l3mdev_accept = !!READ_ONCE(net->sctp.l3mdev_accept); #endif return inet_bound_dev_eq(l3mdev_accept, bound_dev_if, dif, sdif); } /* return a transport with holding it */ struct sctp_transport *sctp_addrs_lookup_transport( struct net *net, const union sctp_addr *laddr, const union sctp_addr *paddr, int dif, int sdif) { struct rhlist_head *tmp, *list; struct sctp_transport *t; int bound_dev_if; struct sctp_hash_cmp_arg arg = { .paddr = paddr, .net = net, .lport = laddr->v4.sin_port, }; list = rhltable_lookup(&sctp_transport_hashtable, &arg, sctp_hash_params); rhl_for_each_entry_rcu(t, tmp, list, node) { if (!sctp_transport_hold(t)) continue; bound_dev_if = READ_ONCE(t->asoc->base.sk->sk_bound_dev_if); if (sctp_sk_bound_dev_eq(net, bound_dev_if, dif, sdif) && sctp_bind_addr_match(&t->asoc->base.bind_addr, laddr, sctp_sk(t->asoc->base.sk))) return t; sctp_transport_put(t); } return NULL; } /* return a transport without holding it, as it's only used under sock lock */ struct sctp_transport *sctp_epaddr_lookup_transport( const struct sctp_endpoint *ep, const union sctp_addr *paddr) { struct rhlist_head *tmp, *list; struct sctp_transport *t; struct sctp_hash_cmp_arg arg = { .paddr = paddr, .net = ep->base.net, .lport = htons(ep->base.bind_addr.port), }; list = rhltable_lookup(&sctp_transport_hashtable, &arg, sctp_hash_params); rhl_for_each_entry_rcu(t, tmp, list, node) if (ep == t->asoc->ep) return t; return NULL; } /* Look up an association. */ static struct sctp_association *__sctp_lookup_association( struct net *net, const union sctp_addr *local, const union sctp_addr *peer, struct sctp_transport **pt, int dif, int sdif) { struct sctp_transport *t; struct sctp_association *asoc = NULL; t = sctp_addrs_lookup_transport(net, local, peer, dif, sdif); if (!t) goto out; asoc = t->asoc; *pt = t; out: return asoc; } /* Look up an association. protected by RCU read lock */ static struct sctp_association *sctp_lookup_association(struct net *net, const union sctp_addr *laddr, const union sctp_addr *paddr, struct sctp_transport **transportp, int dif, int sdif) { struct sctp_association *asoc; rcu_read_lock(); asoc = __sctp_lookup_association(net, laddr, paddr, transportp, dif, sdif); rcu_read_unlock(); return asoc; } /* Is there an association matching the given local and peer addresses? */ bool sctp_has_association(struct net *net, const union sctp_addr *laddr, const union sctp_addr *paddr, int dif, int sdif) { struct sctp_transport *transport; if (sctp_lookup_association(net, laddr, paddr, &transport, dif, sdif)) { sctp_transport_put(transport); return true; } return false; } /* * SCTP Implementors Guide, 2.18 Handling of address * parameters within the INIT or INIT-ACK. * * D) When searching for a matching TCB upon reception of an INIT * or INIT-ACK chunk the receiver SHOULD use not only the * source address of the packet (containing the INIT or * INIT-ACK) but the receiver SHOULD also use all valid * address parameters contained within the chunk. * * 2.18.3 Solution description * * This new text clearly specifies to an implementor the need * to look within the INIT or INIT-ACK. Any implementation that * does not do this, may not be able to establish associations * in certain circumstances. 
* */ static struct sctp_association *__sctp_rcv_init_lookup(struct net *net, struct sk_buff *skb, const union sctp_addr *laddr, struct sctp_transport **transportp, int dif, int sdif) { struct sctp_association *asoc; union sctp_addr addr; union sctp_addr *paddr = &addr; struct sctphdr *sh = sctp_hdr(skb); union sctp_params params; struct sctp_init_chunk *init; struct sctp_af *af; /* * This code will NOT touch anything inside the chunk--it is * strictly READ-ONLY. * * RFC 2960 3 SCTP packet Format * * Multiple chunks can be bundled into one SCTP packet up to * the MTU size, except for the INIT, INIT ACK, and SHUTDOWN * COMPLETE chunks. These chunks MUST NOT be bundled with any * other chunk in a packet. See Section 6.10 for more details * on chunk bundling. */ /* Find the start of the TLVs and the end of the chunk. This is * the region we search for address parameters. */ init = (struct sctp_init_chunk *)skb->data; /* Walk the parameters looking for embedded addresses. */ sctp_walk_params(params, init) { /* Note: Ignoring hostname addresses. */ af = sctp_get_af_specific(param_type2af(params.p->type)); if (!af) continue; if (!af->from_addr_param(paddr, params.addr, sh->source, 0)) continue; asoc = __sctp_lookup_association(net, laddr, paddr, transportp, dif, sdif); if (asoc) return asoc; } return NULL; } /* ADD-IP, Section 5.2 * When an endpoint receives an ASCONF Chunk from the remote peer * special procedures may be needed to identify the association the * ASCONF Chunk is associated with. To properly find the association * the following procedures SHOULD be followed: * * D2) If the association is not found, use the address found in the * Address Parameter TLV combined with the port number found in the * SCTP common header. If found proceed to rule D4. * * D2-ext) If more than one ASCONF Chunks are packed together, use the * address found in the ASCONF Address Parameter TLV of each of the * subsequent ASCONF Chunks. If found, proceed to rule D4. */ static struct sctp_association *__sctp_rcv_asconf_lookup( struct net *net, struct sctp_chunkhdr *ch, const union sctp_addr *laddr, __be16 peer_port, struct sctp_transport **transportp, int dif, int sdif) { struct sctp_addip_chunk *asconf = (struct sctp_addip_chunk *)ch; struct sctp_af *af; union sctp_addr_param *param; union sctp_addr paddr; if (ntohs(ch->length) < sizeof(*asconf) + sizeof(struct sctp_paramhdr)) return NULL; /* Skip over the ADDIP header and find the Address parameter */ param = (union sctp_addr_param *)(asconf + 1); af = sctp_get_af_specific(param_type2af(param->p.type)); if (unlikely(!af)) return NULL; if (!af->from_addr_param(&paddr, param, peer_port, 0)) return NULL; return __sctp_lookup_association(net, laddr, &paddr, transportp, dif, sdif); } /* SCTP-AUTH, Section 6.3: * If the receiver does not find a STCB for a packet containing an AUTH * chunk as the first chunk and not a COOKIE-ECHO chunk as the second * chunk, it MUST use the chunks after the AUTH chunk to look up an existing * association. * * This means that any chunks that can help us identify the association need * to be looked at to find this association. */ static struct sctp_association *__sctp_rcv_walk_lookup(struct net *net, struct sk_buff *skb, const union sctp_addr *laddr, struct sctp_transport **transportp, int dif, int sdif) { struct sctp_association *asoc = NULL; struct sctp_chunkhdr *ch; int have_auth = 0; unsigned int chunk_num = 1; __u8 *ch_end; /* Walk through the chunks looking for AUTH or ASCONF chunks * to help us find the association. 
*/ ch = (struct sctp_chunkhdr *)skb->data; do { /* Break out if chunk length is less then minimal. */ if (ntohs(ch->length) < sizeof(*ch)) break; ch_end = ((__u8 *)ch) + SCTP_PAD4(ntohs(ch->length)); if (ch_end > skb_tail_pointer(skb)) break; switch (ch->type) { case SCTP_CID_AUTH: have_auth = chunk_num; break; case SCTP_CID_COOKIE_ECHO: /* If a packet arrives containing an AUTH chunk as * a first chunk, a COOKIE-ECHO chunk as the second * chunk, and possibly more chunks after them, and * the receiver does not have an STCB for that * packet, then authentication is based on * the contents of the COOKIE- ECHO chunk. */ if (have_auth == 1 && chunk_num == 2) return NULL; break; case SCTP_CID_ASCONF: if (have_auth || net->sctp.addip_noauth) asoc = __sctp_rcv_asconf_lookup( net, ch, laddr, sctp_hdr(skb)->source, transportp, dif, sdif); break; default: break; } if (asoc) break; ch = (struct sctp_chunkhdr *)ch_end; chunk_num++; } while (ch_end + sizeof(*ch) < skb_tail_pointer(skb)); return asoc; } /* * There are circumstances when we need to look inside the SCTP packet * for information to help us find the association. Examples * include looking inside of INIT/INIT-ACK chunks or after the AUTH * chunks. */ static struct sctp_association *__sctp_rcv_lookup_harder(struct net *net, struct sk_buff *skb, const union sctp_addr *laddr, struct sctp_transport **transportp, int dif, int sdif) { struct sctp_chunkhdr *ch; /* We do not allow GSO frames here as we need to linearize and * then cannot guarantee frame boundaries. This shouldn't be an * issue as packets hitting this are mostly INIT or INIT-ACK and * those cannot be on GSO-style anyway. */ if (skb_is_gso(skb) && skb_is_gso_sctp(skb)) return NULL; ch = (struct sctp_chunkhdr *)skb->data; /* The code below will attempt to walk the chunk and extract * parameter information. Before we do that, we need to verify * that the chunk length doesn't cause overflow. Otherwise, we'll * walk off the end. */ if (SCTP_PAD4(ntohs(ch->length)) > skb->len) return NULL; /* If this is INIT/INIT-ACK look inside the chunk too. */ if (ch->type == SCTP_CID_INIT || ch->type == SCTP_CID_INIT_ACK) return __sctp_rcv_init_lookup(net, skb, laddr, transportp, dif, sdif); return __sctp_rcv_walk_lookup(net, skb, laddr, transportp, dif, sdif); } /* Lookup an association for an inbound skb. */ static struct sctp_association *__sctp_rcv_lookup(struct net *net, struct sk_buff *skb, const union sctp_addr *paddr, const union sctp_addr *laddr, struct sctp_transport **transportp, int dif, int sdif) { struct sctp_association *asoc; asoc = __sctp_lookup_association(net, laddr, paddr, transportp, dif, sdif); if (asoc) goto out; /* Further lookup for INIT/INIT-ACK packets. * SCTP Implementors Guide, 2.18 Handling of address * parameters within the INIT or INIT-ACK. */ asoc = __sctp_rcv_lookup_harder(net, skb, laddr, transportp, dif, sdif); if (asoc) goto out; if (paddr->sa.sa_family == AF_INET) pr_debug("sctp: asoc not found for src:%pI4:%d dst:%pI4:%d\n", &laddr->v4.sin_addr, ntohs(laddr->v4.sin_port), &paddr->v4.sin_addr, ntohs(paddr->v4.sin_port)); else pr_debug("sctp: asoc not found for src:%pI6:%d dst:%pI6:%d\n", &laddr->v6.sin6_addr, ntohs(laddr->v6.sin6_port), &paddr->v6.sin6_addr, ntohs(paddr->v6.sin6_port)); out: return asoc; }
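Both sctp_rcv_ootb() and __sctp_rcv_walk_lookup() above rely on the same TLV walk: read a chunk header, round the chunk length up to a 4-byte boundary (SCTP_PAD4), validate it against the remaining packet length, then advance. Below is a minimal, self-contained userspace sketch of that pattern; the simplified struct chunkhdr and the walk_chunks() helper are illustrative names, not kernel code.

/* Userspace sketch of the padded chunk walk used by sctp_rcv_ootb()
 * and __sctp_rcv_walk_lookup(): iterate TLV chunks in a buffer,
 * stopping on runt or truncated chunks.  Types are simplified.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>

struct chunkhdr {            /* mirrors struct sctp_chunkhdr */
	uint8_t  type;
	uint8_t  flags;
	uint16_t length;     /* network byte order, includes this header */
};

#define PAD4(len) (((len) + 3) & ~(size_t)3)   /* same rounding as SCTP_PAD4 */

static void walk_chunks(const uint8_t *buf, size_t len)
{
	size_t offset = 0;

	while (offset + sizeof(struct chunkhdr) <= len) {
		struct chunkhdr ch;
		size_t chunk_len, chunk_end;

		memcpy(&ch, buf + offset, sizeof(ch));
		chunk_len = ntohs(ch.length);
		if (chunk_len < sizeof(ch))     /* runt chunk: stop walking */
			break;
		chunk_end = offset + PAD4(chunk_len);
		if (chunk_end > len)            /* truncated chunk: stop walking */
			break;

		printf("chunk type %u, length %zu (padded to %zu)\n",
		       ch.type, chunk_len, PAD4(chunk_len));
		offset = chunk_end;
	}
}

int main(void)
{
	/* Two fake chunks: type 1, length 6 (padded to 8), then type 6, length 4. */
	uint8_t pkt[] = { 1, 0, 0, 6, 0xaa, 0xbb, 0, 0,
			  6, 0, 0, 4 };

	walk_chunks(pkt, sizeof(pkt));
	return 0;
}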
// SPDX-License-Identifier: GPL-2.0 /* * sysctl_net_ipv6.c: sysctl interface to net IPV6 subsystem. * * Changes: * YOSHIFUJI Hideaki @USAGI: added icmp sysctl table. */ #include <linux/mm.h> #include <linux/sysctl.h> #include <linux/in6.h> #include <linux/ipv6.h> #include <linux/slab.h> #include <linux/export.h> #include <net/ndisc.h> #include <net/ipv6.h> #include <net/addrconf.h> #include <net/inet_frag.h> #include <net/netevent.h> #include <net/ip_fib.h> #ifdef CONFIG_NETLABEL #include <net/calipso.h> #endif #include <linux/ioam6.h> static int flowlabel_reflect_max = 0x7; static int auto_flowlabels_max = IP6_AUTO_FLOW_LABEL_MAX; static u32 rt6_multipath_hash_fields_all_mask = FIB_MULTIPATH_HASH_FIELD_ALL_MASK; static u32 ioam6_id_max = IOAM6_DEFAULT_ID; static u64 ioam6_id_wide_max = IOAM6_DEFAULT_ID_WIDE; static int proc_rt6_multipath_hash_policy(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct net *net; int ret; net = container_of(table->data, struct net, ipv6.sysctl.multipath_hash_policy); ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos); if (write && ret == 0) call_netevent_notifiers(NETEVENT_IPV6_MPATH_HASH_UPDATE, net); return ret; } static int proc_rt6_multipath_hash_fields(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct net *net; int ret; net = container_of(table->data, struct net, ipv6.sysctl.multipath_hash_fields); ret = proc_douintvec_minmax(table, write, buffer, lenp, ppos); if (write && ret == 0) call_netevent_notifiers(NETEVENT_IPV6_MPATH_HASH_UPDATE, net); return ret; } static struct ctl_table ipv6_table_template[] = { { .procname = "bindv6only", .data = &init_net.ipv6.sysctl.bindv6only, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "anycast_src_echo_reply", .data = &init_net.ipv6.sysctl.anycast_src_echo_reply, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "flowlabel_consistency", .data = &init_net.ipv6.sysctl.flowlabel_consistency, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "auto_flowlabels", .data =
&init_net.ipv6.sysctl.auto_flowlabels, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra2 = &auto_flowlabels_max }, { .procname = "fwmark_reflect", .data = &init_net.ipv6.sysctl.fwmark_reflect, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "idgen_retries", .data = &init_net.ipv6.sysctl.idgen_retries, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "idgen_delay", .data = &init_net.ipv6.sysctl.idgen_delay, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, { .procname = "flowlabel_state_ranges", .data = &init_net.ipv6.sysctl.flowlabel_state_ranges, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "ip_nonlocal_bind", .data = &init_net.ipv6.sysctl.ip_nonlocal_bind, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, }, { .procname = "flowlabel_reflect", .data = &init_net.ipv6.sysctl.flowlabel_reflect, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = &flowlabel_reflect_max, }, { .procname = "max_dst_opts_number", .data = &init_net.ipv6.sysctl.max_dst_opts_cnt, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, { .procname = "max_hbh_opts_number", .data = &init_net.ipv6.sysctl.max_hbh_opts_cnt, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, { .procname = "max_dst_opts_length", .data = &init_net.ipv6.sysctl.max_dst_opts_len, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, { .procname = "max_hbh_length", .data = &init_net.ipv6.sysctl.max_hbh_opts_len, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, { .procname = "fib_multipath_hash_policy", .data = &init_net.ipv6.sysctl.multipath_hash_policy, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_rt6_multipath_hash_policy, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_THREE, }, { .procname = "fib_multipath_hash_fields", .data = &init_net.ipv6.sysctl.multipath_hash_fields, .maxlen = sizeof(u32), .mode = 0644, .proc_handler = proc_rt6_multipath_hash_fields, .extra1 = SYSCTL_ONE, .extra2 = &rt6_multipath_hash_fields_all_mask, }, { .procname = "seg6_flowlabel", .data = &init_net.ipv6.sysctl.seg6_flowlabel, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, { .procname = "fib_notify_on_flag_change", .data = &init_net.ipv6.sysctl.fib_notify_on_flag_change, .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, { .procname = "ioam6_id", .data = &init_net.ipv6.sysctl.ioam6_id, .maxlen = sizeof(u32), .mode = 0644, .proc_handler = proc_douintvec_minmax, .extra2 = &ioam6_id_max, }, { .procname = "ioam6_id_wide", .data = &init_net.ipv6.sysctl.ioam6_id_wide, .maxlen = sizeof(u64), .mode = 0644, .proc_handler = proc_doulongvec_minmax, .extra2 = &ioam6_id_wide_max, }, }; static struct ctl_table ipv6_rotable[] = { { .procname = "mld_max_msf", .data = &sysctl_mld_max_msf, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec }, { .procname = "mld_qrv", .data = &sysctl_mld_qrv, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ONE }, #ifdef CONFIG_NETLABEL { .procname = "calipso_cache_enable", .data = &calipso_cache_enabled, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, { .procname = "calipso_cache_bucket_size", .data = &calipso_cache_bucketsize, .maxlen = 
sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, #endif /* CONFIG_NETLABEL */ }; static int __net_init ipv6_sysctl_net_init(struct net *net) { size_t table_size = ARRAY_SIZE(ipv6_table_template); struct ctl_table *ipv6_table; struct ctl_table *ipv6_route_table; struct ctl_table *ipv6_icmp_table; int err, i; err = -ENOMEM; ipv6_table = kmemdup(ipv6_table_template, sizeof(ipv6_table_template), GFP_KERNEL); if (!ipv6_table) goto out; /* Update the variables to point into the current struct net */ for (i = 0; i < table_size; i++) ipv6_table[i].data += (void *)net - (void *)&init_net; ipv6_route_table = ipv6_route_sysctl_init(net); if (!ipv6_route_table) goto out_ipv6_table; ipv6_icmp_table = ipv6_icmp_sysctl_init(net); if (!ipv6_icmp_table) goto out_ipv6_route_table; net->ipv6.sysctl.hdr = register_net_sysctl_sz(net, "net/ipv6", ipv6_table, table_size); if (!net->ipv6.sysctl.hdr) goto out_ipv6_icmp_table; net->ipv6.sysctl.route_hdr = register_net_sysctl_sz(net, "net/ipv6/route", ipv6_route_table, ipv6_route_sysctl_table_size(net)); if (!net->ipv6.sysctl.route_hdr) goto out_unregister_ipv6_table; net->ipv6.sysctl.icmp_hdr = register_net_sysctl_sz(net, "net/ipv6/icmp", ipv6_icmp_table, ipv6_icmp_sysctl_table_size()); if (!net->ipv6.sysctl.icmp_hdr) goto out_unregister_route_table; err = 0; out: return err; out_unregister_route_table: unregister_net_sysctl_table(net->ipv6.sysctl.route_hdr); out_unregister_ipv6_table: unregister_net_sysctl_table(net->ipv6.sysctl.hdr); out_ipv6_icmp_table: kfree(ipv6_icmp_table); out_ipv6_route_table: kfree(ipv6_route_table); out_ipv6_table: kfree(ipv6_table); goto out; } static void __net_exit ipv6_sysctl_net_exit(struct net *net) { const struct ctl_table *ipv6_table; const struct ctl_table *ipv6_route_table; const struct ctl_table *ipv6_icmp_table; ipv6_table = net->ipv6.sysctl.hdr->ctl_table_arg; ipv6_route_table = net->ipv6.sysctl.route_hdr->ctl_table_arg; ipv6_icmp_table = net->ipv6.sysctl.icmp_hdr->ctl_table_arg; unregister_net_sysctl_table(net->ipv6.sysctl.icmp_hdr); unregister_net_sysctl_table(net->ipv6.sysctl.route_hdr); unregister_net_sysctl_table(net->ipv6.sysctl.hdr); kfree(ipv6_table); kfree(ipv6_route_table); kfree(ipv6_icmp_table); } static struct pernet_operations ipv6_sysctl_net_ops = { .init = ipv6_sysctl_net_init, .exit = ipv6_sysctl_net_exit, }; static struct ctl_table_header *ip6_header; int ipv6_sysctl_register(void) { int err = -ENOMEM; ip6_header = register_net_sysctl(&init_net, "net/ipv6", ipv6_rotable); if (!ip6_header) goto out; err = register_pernet_subsys(&ipv6_sysctl_net_ops); if (err) goto err_pernet; out: return err; err_pernet: unregister_net_sysctl_table(ip6_header); goto out; } void ipv6_sysctl_unregister(void) { unregister_net_sysctl_table(ip6_header); unregister_pernet_subsys(&ipv6_sysctl_net_ops); }
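The template above is registered per network namespace under "net/ipv6", so each .procname becomes a file such as /proc/sys/net/ipv6/bindv6only. Below is a minimal userspace sketch that reads one of those knobs and writes it back unchanged; writing normally requires root in the owning namespace.

/* Minimal userspace sketch: read one of the per-namespace knobs
 * registered above via its procfs path.  The path follows from
 * register_net_sysctl_sz(net, "net/ipv6", ...) plus .procname.
 */
#include <stdio.h>

#define KNOB "/proc/sys/net/ipv6/bindv6only"

int main(void)
{
	FILE *f = fopen(KNOB, "r");
	int val;

	if (!f) {
		perror("fopen " KNOB);
		return 1;
	}
	if (fscanf(f, "%d", &val) != 1)
		val = -1;
	fclose(f);
	printf("bindv6only = %d\n", val);

	/* Writing usually requires root in the owning namespace. */
	f = fopen(KNOB, "w");
	if (f) {
		fprintf(f, "%d\n", val);   /* write the same value back */
		fclose(f);
	}
	return 0;
}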
/* * Copyright (c) 2016-2017, Mellanox Technologies. All rights reserved. * Copyright (c) 2016-2017, Dave Watson <davejwatson@fb.com>. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE.
*/ #include <linux/module.h> #include <net/tcp.h> #include <net/inet_common.h> #include <linux/highmem.h> #include <linux/netdevice.h> #include <linux/sched/signal.h> #include <linux/inetdevice.h> #include <linux/inet_diag.h> #include <net/snmp.h> #include <net/tls.h> #include <net/tls_toe.h> #include "tls.h" MODULE_AUTHOR("Mellanox Technologies"); MODULE_DESCRIPTION("Transport Layer Security Support"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_ALIAS_TCP_ULP("tls"); enum { TLSV4, TLSV6, TLS_NUM_PROTS, }; #define CHECK_CIPHER_DESC(cipher,ci) \ static_assert(cipher ## _IV_SIZE <= TLS_MAX_IV_SIZE); \ static_assert(cipher ## _SALT_SIZE <= TLS_MAX_SALT_SIZE); \ static_assert(cipher ## _REC_SEQ_SIZE <= TLS_MAX_REC_SEQ_SIZE); \ static_assert(cipher ## _TAG_SIZE == TLS_TAG_SIZE); \ static_assert(sizeof_field(struct ci, iv) == cipher ## _IV_SIZE); \ static_assert(sizeof_field(struct ci, key) == cipher ## _KEY_SIZE); \ static_assert(sizeof_field(struct ci, salt) == cipher ## _SALT_SIZE); \ static_assert(sizeof_field(struct ci, rec_seq) == cipher ## _REC_SEQ_SIZE); #define __CIPHER_DESC(ci) \ .iv_offset = offsetof(struct ci, iv), \ .key_offset = offsetof(struct ci, key), \ .salt_offset = offsetof(struct ci, salt), \ .rec_seq_offset = offsetof(struct ci, rec_seq), \ .crypto_info = sizeof(struct ci) #define CIPHER_DESC(cipher,ci,algname,_offloadable) [cipher - TLS_CIPHER_MIN] = { \ .nonce = cipher ## _IV_SIZE, \ .iv = cipher ## _IV_SIZE, \ .key = cipher ## _KEY_SIZE, \ .salt = cipher ## _SALT_SIZE, \ .tag = cipher ## _TAG_SIZE, \ .rec_seq = cipher ## _REC_SEQ_SIZE, \ .cipher_name = algname, \ .offloadable = _offloadable, \ __CIPHER_DESC(ci), \ } #define CIPHER_DESC_NONCE0(cipher,ci,algname,_offloadable) [cipher - TLS_CIPHER_MIN] = { \ .nonce = 0, \ .iv = cipher ## _IV_SIZE, \ .key = cipher ## _KEY_SIZE, \ .salt = cipher ## _SALT_SIZE, \ .tag = cipher ## _TAG_SIZE, \ .rec_seq = cipher ## _REC_SEQ_SIZE, \ .cipher_name = algname, \ .offloadable = _offloadable, \ __CIPHER_DESC(ci), \ } const struct tls_cipher_desc tls_cipher_desc[TLS_CIPHER_MAX + 1 - TLS_CIPHER_MIN] = { CIPHER_DESC(TLS_CIPHER_AES_GCM_128, tls12_crypto_info_aes_gcm_128, "gcm(aes)", true), CIPHER_DESC(TLS_CIPHER_AES_GCM_256, tls12_crypto_info_aes_gcm_256, "gcm(aes)", true), CIPHER_DESC(TLS_CIPHER_AES_CCM_128, tls12_crypto_info_aes_ccm_128, "ccm(aes)", false), CIPHER_DESC_NONCE0(TLS_CIPHER_CHACHA20_POLY1305, tls12_crypto_info_chacha20_poly1305, "rfc7539(chacha20,poly1305)", false), CIPHER_DESC(TLS_CIPHER_SM4_GCM, tls12_crypto_info_sm4_gcm, "gcm(sm4)", false), CIPHER_DESC(TLS_CIPHER_SM4_CCM, tls12_crypto_info_sm4_ccm, "ccm(sm4)", false), CIPHER_DESC(TLS_CIPHER_ARIA_GCM_128, tls12_crypto_info_aria_gcm_128, "gcm(aria)", false), CIPHER_DESC(TLS_CIPHER_ARIA_GCM_256, tls12_crypto_info_aria_gcm_256, "gcm(aria)", false), }; CHECK_CIPHER_DESC(TLS_CIPHER_AES_GCM_128, tls12_crypto_info_aes_gcm_128); CHECK_CIPHER_DESC(TLS_CIPHER_AES_GCM_256, tls12_crypto_info_aes_gcm_256); CHECK_CIPHER_DESC(TLS_CIPHER_AES_CCM_128, tls12_crypto_info_aes_ccm_128); CHECK_CIPHER_DESC(TLS_CIPHER_CHACHA20_POLY1305, tls12_crypto_info_chacha20_poly1305); CHECK_CIPHER_DESC(TLS_CIPHER_SM4_GCM, tls12_crypto_info_sm4_gcm); CHECK_CIPHER_DESC(TLS_CIPHER_SM4_CCM, tls12_crypto_info_sm4_ccm); CHECK_CIPHER_DESC(TLS_CIPHER_ARIA_GCM_128, tls12_crypto_info_aria_gcm_128); CHECK_CIPHER_DESC(TLS_CIPHER_ARIA_GCM_256, tls12_crypto_info_aria_gcm_256); static const struct proto *saved_tcpv6_prot; static DEFINE_MUTEX(tcpv6_prot_mutex); static const struct proto *saved_tcpv4_prot; static 
DEFINE_MUTEX(tcpv4_prot_mutex); static struct proto tls_prots[TLS_NUM_PROTS][TLS_NUM_CONFIG][TLS_NUM_CONFIG]; static struct proto_ops tls_proto_ops[TLS_NUM_PROTS][TLS_NUM_CONFIG][TLS_NUM_CONFIG]; static void build_protos(struct proto prot[TLS_NUM_CONFIG][TLS_NUM_CONFIG], const struct proto *base); void update_sk_prot(struct sock *sk, struct tls_context *ctx) { int ip_ver = sk->sk_family == AF_INET6 ? TLSV6 : TLSV4; WRITE_ONCE(sk->sk_prot, &tls_prots[ip_ver][ctx->tx_conf][ctx->rx_conf]); WRITE_ONCE(sk->sk_socket->ops, &tls_proto_ops[ip_ver][ctx->tx_conf][ctx->rx_conf]); } int wait_on_pending_writer(struct sock *sk, long *timeo) { DEFINE_WAIT_FUNC(wait, woken_wake_function); int ret, rc = 0; add_wait_queue(sk_sleep(sk), &wait); while (1) { if (!*timeo) { rc = -EAGAIN; break; } if (signal_pending(current)) { rc = sock_intr_errno(*timeo); break; } ret = sk_wait_event(sk, timeo, !READ_ONCE(sk->sk_write_pending), &wait); if (ret) { if (ret < 0) rc = ret; break; } } remove_wait_queue(sk_sleep(sk), &wait); return rc; } int tls_push_sg(struct sock *sk, struct tls_context *ctx, struct scatterlist *sg, u16 first_offset, int flags) { struct bio_vec bvec; struct msghdr msg = { .msg_flags = MSG_SPLICE_PAGES | flags, }; int ret = 0; struct page *p; size_t size; int offset = first_offset; size = sg->length - offset; offset += sg->offset; ctx->splicing_pages = true; while (1) { /* is sending application-limited? */ tcp_rate_check_app_limited(sk); p = sg_page(sg); retry: bvec_set_page(&bvec, p, size, offset); iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, size); ret = tcp_sendmsg_locked(sk, &msg, size); if (ret != size) { if (ret > 0) { offset += ret; size -= ret; goto retry; } offset -= sg->offset; ctx->partially_sent_offset = offset; ctx->partially_sent_record = (void *)sg; ctx->splicing_pages = false; return ret; } put_page(p); sk_mem_uncharge(sk, sg->length); sg = sg_next(sg); if (!sg) break; offset = sg->offset; size = sg->length; } ctx->splicing_pages = false; return 0; } static int tls_handle_open_record(struct sock *sk, int flags) { struct tls_context *ctx = tls_get_ctx(sk); if (tls_is_pending_open_record(ctx)) return ctx->push_pending_record(sk, flags); return 0; } int tls_process_cmsg(struct sock *sk, struct msghdr *msg, unsigned char *record_type) { struct cmsghdr *cmsg; int rc = -EINVAL; for_each_cmsghdr(cmsg, msg) { if (!CMSG_OK(msg, cmsg)) return -EINVAL; if (cmsg->cmsg_level != SOL_TLS) continue; switch (cmsg->cmsg_type) { case TLS_SET_RECORD_TYPE: if (cmsg->cmsg_len < CMSG_LEN(sizeof(*record_type))) return -EINVAL; if (msg->msg_flags & MSG_MORE) return -EINVAL; rc = tls_handle_open_record(sk, msg->msg_flags); if (rc) return rc; *record_type = *(unsigned char *)CMSG_DATA(cmsg); rc = 0; break; default: return -EINVAL; } } return rc; } int tls_push_partial_record(struct sock *sk, struct tls_context *ctx, int flags) { struct scatterlist *sg; u16 offset; sg = ctx->partially_sent_record; offset = ctx->partially_sent_offset; ctx->partially_sent_record = NULL; return tls_push_sg(sk, ctx, sg, offset, flags); } void tls_free_partial_record(struct sock *sk, struct tls_context *ctx) { struct scatterlist *sg; for (sg = ctx->partially_sent_record; sg; sg = sg_next(sg)) { put_page(sg_page(sg)); sk_mem_uncharge(sk, sg->length); } ctx->partially_sent_record = NULL; } static void tls_write_space(struct sock *sk) { struct tls_context *ctx = tls_get_ctx(sk); /* If splicing_pages call lower protocol write space handler * to ensure we wake up any waiting operations there. 
For example * if splicing pages where to call sk_wait_event. */ if (ctx->splicing_pages) { ctx->sk_write_space(sk); return; } #ifdef CONFIG_TLS_DEVICE if (ctx->tx_conf == TLS_HW) tls_device_write_space(sk, ctx); else #endif tls_sw_write_space(sk, ctx); ctx->sk_write_space(sk); } /** * tls_ctx_free() - free TLS ULP context * @sk: socket to with @ctx is attached * @ctx: TLS context structure * * Free TLS context. If @sk is %NULL caller guarantees that the socket * to which @ctx was attached has no outstanding references. */ void tls_ctx_free(struct sock *sk, struct tls_context *ctx) { if (!ctx) return; memzero_explicit(&ctx->crypto_send, sizeof(ctx->crypto_send)); memzero_explicit(&ctx->crypto_recv, sizeof(ctx->crypto_recv)); mutex_destroy(&ctx->tx_lock); if (sk) kfree_rcu(ctx, rcu); else kfree(ctx); } static void tls_sk_proto_cleanup(struct sock *sk, struct tls_context *ctx, long timeo) { if (unlikely(sk->sk_write_pending) && !wait_on_pending_writer(sk, &timeo)) tls_handle_open_record(sk, 0); /* We need these for tls_sw_fallback handling of other packets */ if (ctx->tx_conf == TLS_SW) { tls_sw_release_resources_tx(sk); TLS_DEC_STATS(sock_net(sk), LINUX_MIB_TLSCURRTXSW); } else if (ctx->tx_conf == TLS_HW) { tls_device_free_resources_tx(sk); TLS_DEC_STATS(sock_net(sk), LINUX_MIB_TLSCURRTXDEVICE); } if (ctx->rx_conf == TLS_SW) { tls_sw_release_resources_rx(sk); TLS_DEC_STATS(sock_net(sk), LINUX_MIB_TLSCURRRXSW); } else if (ctx->rx_conf == TLS_HW) { tls_device_offload_cleanup_rx(sk); TLS_DEC_STATS(sock_net(sk), LINUX_MIB_TLSCURRRXDEVICE); } } static void tls_sk_proto_close(struct sock *sk, long timeout) { struct inet_connection_sock *icsk = inet_csk(sk); struct tls_context *ctx = tls_get_ctx(sk); long timeo = sock_sndtimeo(sk, 0); bool free_ctx; if (ctx->tx_conf == TLS_SW) tls_sw_cancel_work_tx(ctx); lock_sock(sk); free_ctx = ctx->tx_conf != TLS_HW && ctx->rx_conf != TLS_HW; if (ctx->tx_conf != TLS_BASE || ctx->rx_conf != TLS_BASE) tls_sk_proto_cleanup(sk, ctx, timeo); write_lock_bh(&sk->sk_callback_lock); if (free_ctx) rcu_assign_pointer(icsk->icsk_ulp_data, NULL); WRITE_ONCE(sk->sk_prot, ctx->sk_proto); if (sk->sk_write_space == tls_write_space) sk->sk_write_space = ctx->sk_write_space; write_unlock_bh(&sk->sk_callback_lock); release_sock(sk); if (ctx->tx_conf == TLS_SW) tls_sw_free_ctx_tx(ctx); if (ctx->rx_conf == TLS_SW || ctx->rx_conf == TLS_HW) tls_sw_strparser_done(ctx); if (ctx->rx_conf == TLS_SW) tls_sw_free_ctx_rx(ctx); ctx->sk_proto->close(sk, timeout); if (free_ctx) tls_ctx_free(sk, ctx); } static __poll_t tls_sk_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait) { struct tls_sw_context_rx *ctx; struct tls_context *tls_ctx; struct sock *sk = sock->sk; struct sk_psock *psock; __poll_t mask = 0; u8 shutdown; int state; mask = tcp_poll(file, sock, wait); state = inet_sk_state_load(sk); shutdown = READ_ONCE(sk->sk_shutdown); if (unlikely(state != TCP_ESTABLISHED || shutdown & RCV_SHUTDOWN)) return mask; tls_ctx = tls_get_ctx(sk); ctx = tls_sw_ctx_rx(tls_ctx); psock = sk_psock_get(sk); if ((skb_queue_empty_lockless(&ctx->rx_list) && !tls_strp_msg_ready(ctx) && sk_psock_queue_empty(psock)) || READ_ONCE(ctx->key_update_pending)) mask &= ~(EPOLLIN | EPOLLRDNORM); if (psock) sk_psock_put(sk, psock); return mask; } static int do_tls_getsockopt_conf(struct sock *sk, char __user *optval, int __user *optlen, int tx) { int rc = 0; const struct tls_cipher_desc *cipher_desc; struct tls_context *ctx = tls_get_ctx(sk); struct tls_crypto_info *crypto_info; struct 
cipher_context *cctx; int len; if (get_user(len, optlen)) return -EFAULT; if (!optval || (len < sizeof(*crypto_info))) { rc = -EINVAL; goto out; } if (!ctx) { rc = -EBUSY; goto out; } /* get user crypto info */ if (tx) { crypto_info = &ctx->crypto_send.info; cctx = &ctx->tx; } else { crypto_info = &ctx->crypto_recv.info; cctx = &ctx->rx; } if (!TLS_CRYPTO_INFO_READY(crypto_info)) { rc = -EBUSY; goto out; } if (len == sizeof(*crypto_info)) { if (copy_to_user(optval, crypto_info, sizeof(*crypto_info))) rc = -EFAULT; goto out; } cipher_desc = get_cipher_desc(crypto_info->cipher_type); if (!cipher_desc || len != cipher_desc->crypto_info) { rc = -EINVAL; goto out; } memcpy(crypto_info_iv(crypto_info, cipher_desc), cctx->iv + cipher_desc->salt, cipher_desc->iv); memcpy(crypto_info_rec_seq(crypto_info, cipher_desc), cctx->rec_seq, cipher_desc->rec_seq); if (copy_to_user(optval, crypto_info, cipher_desc->crypto_info)) rc = -EFAULT; out: return rc; } static int do_tls_getsockopt_tx_zc(struct sock *sk, char __user *optval, int __user *optlen) { struct tls_context *ctx = tls_get_ctx(sk); unsigned int value; int len; if (get_user(len, optlen)) return -EFAULT; if (len != sizeof(value)) return -EINVAL; value = ctx->zerocopy_sendfile; if (copy_to_user(optval, &value, sizeof(value))) return -EFAULT; return 0; } static int do_tls_getsockopt_no_pad(struct sock *sk, char __user *optval, int __user *optlen) { struct tls_context *ctx = tls_get_ctx(sk); int value, len; if (ctx->prot_info.version != TLS_1_3_VERSION) return -EINVAL; if (get_user(len, optlen)) return -EFAULT; if (len < sizeof(value)) return -EINVAL; value = -EINVAL; if (ctx->rx_conf == TLS_SW || ctx->rx_conf == TLS_HW) value = ctx->rx_no_pad; if (value < 0) return value; if (put_user(sizeof(value), optlen)) return -EFAULT; if (copy_to_user(optval, &value, sizeof(value))) return -EFAULT; return 0; } static int do_tls_getsockopt(struct sock *sk, int optname, char __user *optval, int __user *optlen) { int rc = 0; lock_sock(sk); switch (optname) { case TLS_TX: case TLS_RX: rc = do_tls_getsockopt_conf(sk, optval, optlen, optname == TLS_TX); break; case TLS_TX_ZEROCOPY_RO: rc = do_tls_getsockopt_tx_zc(sk, optval, optlen); break; case TLS_RX_EXPECT_NO_PAD: rc = do_tls_getsockopt_no_pad(sk, optval, optlen); break; default: rc = -ENOPROTOOPT; break; } release_sock(sk); return rc; } static int tls_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { struct tls_context *ctx = tls_get_ctx(sk); if (level != SOL_TLS) return ctx->sk_proto->getsockopt(sk, level, optname, optval, optlen); return do_tls_getsockopt(sk, optname, optval, optlen); } static int validate_crypto_info(const struct tls_crypto_info *crypto_info, const struct tls_crypto_info *alt_crypto_info) { if (crypto_info->version != TLS_1_2_VERSION && crypto_info->version != TLS_1_3_VERSION) return -EINVAL; switch (crypto_info->cipher_type) { case TLS_CIPHER_ARIA_GCM_128: case TLS_CIPHER_ARIA_GCM_256: if (crypto_info->version != TLS_1_2_VERSION) return -EINVAL; break; } /* Ensure that TLS version and ciphers are same in both directions */ if (TLS_CRYPTO_INFO_READY(alt_crypto_info)) { if (alt_crypto_info->version != crypto_info->version || alt_crypto_info->cipher_type != crypto_info->cipher_type) return -EINVAL; } return 0; } static int do_tls_setsockopt_conf(struct sock *sk, sockptr_t optval, unsigned int optlen, int tx) { struct tls_crypto_info *crypto_info, *alt_crypto_info; struct tls_crypto_info *old_crypto_info = NULL; struct tls_context *ctx = 
tls_get_ctx(sk); const struct tls_cipher_desc *cipher_desc; union tls_crypto_context *crypto_ctx; union tls_crypto_context tmp = {}; bool update = false; int rc = 0; int conf; if (sockptr_is_null(optval) || (optlen < sizeof(*crypto_info))) return -EINVAL; if (tx) { crypto_ctx = &ctx->crypto_send; alt_crypto_info = &ctx->crypto_recv.info; } else { crypto_ctx = &ctx->crypto_recv; alt_crypto_info = &ctx->crypto_send.info; } crypto_info = &crypto_ctx->info; if (TLS_CRYPTO_INFO_READY(crypto_info)) { /* Currently we only support setting crypto info more * than one time for TLS 1.3 */ if (crypto_info->version != TLS_1_3_VERSION) { TLS_INC_STATS(sock_net(sk), tx ? LINUX_MIB_TLSTXREKEYERROR : LINUX_MIB_TLSRXREKEYERROR); return -EBUSY; } update = true; old_crypto_info = crypto_info; crypto_info = &tmp.info; crypto_ctx = &tmp; } rc = copy_from_sockptr(crypto_info, optval, sizeof(*crypto_info)); if (rc) { rc = -EFAULT; goto err_crypto_info; } if (update) { /* Ensure that TLS version and ciphers are not modified */ if (crypto_info->version != old_crypto_info->version || crypto_info->cipher_type != old_crypto_info->cipher_type) rc = -EINVAL; } else { rc = validate_crypto_info(crypto_info, alt_crypto_info); } if (rc) goto err_crypto_info; cipher_desc = get_cipher_desc(crypto_info->cipher_type); if (!cipher_desc) { rc = -EINVAL; goto err_crypto_info; } if (optlen != cipher_desc->crypto_info) { rc = -EINVAL; goto err_crypto_info; } rc = copy_from_sockptr_offset(crypto_info + 1, optval, sizeof(*crypto_info), optlen - sizeof(*crypto_info)); if (rc) { rc = -EFAULT; goto err_crypto_info; } if (tx) { rc = tls_set_device_offload(sk); conf = TLS_HW; if (!rc) { TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSTXDEVICE); TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSCURRTXDEVICE); } else { rc = tls_set_sw_offload(sk, 1, update ? crypto_info : NULL); if (rc) goto err_crypto_info; if (update) { TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSTXREKEYOK); } else { TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSTXSW); TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSCURRTXSW); } conf = TLS_SW; } } else { rc = tls_set_device_offload_rx(sk, ctx); conf = TLS_HW; if (!rc) { TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSRXDEVICE); TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSCURRRXDEVICE); } else { rc = tls_set_sw_offload(sk, 0, update ? crypto_info : NULL); if (rc) goto err_crypto_info; if (update) { TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSRXREKEYOK); } else { TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSRXSW); TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSCURRRXSW); } conf = TLS_SW; } if (!update) tls_sw_strparser_arm(sk, ctx); } if (tx) ctx->tx_conf = conf; else ctx->rx_conf = conf; update_sk_prot(sk, ctx); if (update) return 0; if (tx) { ctx->sk_write_space = sk->sk_write_space; sk->sk_write_space = tls_write_space; } else { struct tls_sw_context_rx *rx_ctx = tls_sw_ctx_rx(ctx); tls_strp_check_rcv(&rx_ctx->strp); } return 0; err_crypto_info: if (update) { TLS_INC_STATS(sock_net(sk), tx ? 
LINUX_MIB_TLSTXREKEYERROR : LINUX_MIB_TLSRXREKEYERROR); } memzero_explicit(crypto_ctx, sizeof(*crypto_ctx)); return rc; } static int do_tls_setsockopt_tx_zc(struct sock *sk, sockptr_t optval, unsigned int optlen) { struct tls_context *ctx = tls_get_ctx(sk); unsigned int value; if (sockptr_is_null(optval) || optlen != sizeof(value)) return -EINVAL; if (copy_from_sockptr(&value, optval, sizeof(value))) return -EFAULT; if (value > 1) return -EINVAL; ctx->zerocopy_sendfile = value; return 0; } static int do_tls_setsockopt_no_pad(struct sock *sk, sockptr_t optval, unsigned int optlen) { struct tls_context *ctx = tls_get_ctx(sk); u32 val; int rc; if (ctx->prot_info.version != TLS_1_3_VERSION || sockptr_is_null(optval) || optlen < sizeof(val)) return -EINVAL; rc = copy_from_sockptr(&val, optval, sizeof(val)); if (rc) return -EFAULT; if (val > 1) return -EINVAL; rc = check_zeroed_sockptr(optval, sizeof(val), optlen - sizeof(val)); if (rc < 1) return rc == 0 ? -EINVAL : rc; lock_sock(sk); rc = -EINVAL; if (ctx->rx_conf == TLS_SW || ctx->rx_conf == TLS_HW) { ctx->rx_no_pad = val; tls_update_rx_zc_capable(ctx); rc = 0; } release_sock(sk); return rc; } static int do_tls_setsockopt(struct sock *sk, int optname, sockptr_t optval, unsigned int optlen) { int rc = 0; switch (optname) { case TLS_TX: case TLS_RX: lock_sock(sk); rc = do_tls_setsockopt_conf(sk, optval, optlen, optname == TLS_TX); release_sock(sk); break; case TLS_TX_ZEROCOPY_RO: lock_sock(sk); rc = do_tls_setsockopt_tx_zc(sk, optval, optlen); release_sock(sk); break; case TLS_RX_EXPECT_NO_PAD: rc = do_tls_setsockopt_no_pad(sk, optval, optlen); break; default: rc = -ENOPROTOOPT; break; } return rc; } static int tls_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { struct tls_context *ctx = tls_get_ctx(sk); if (level != SOL_TLS) return ctx->sk_proto->setsockopt(sk, level, optname, optval, optlen); return do_tls_setsockopt(sk, optname, optval, optlen); } static int tls_disconnect(struct sock *sk, int flags) { return -EOPNOTSUPP; } struct tls_context *tls_ctx_create(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct tls_context *ctx; ctx = kzalloc(sizeof(*ctx), GFP_ATOMIC); if (!ctx) return NULL; mutex_init(&ctx->tx_lock); ctx->sk_proto = READ_ONCE(sk->sk_prot); ctx->sk = sk; /* Release semantic of rcu_assign_pointer() ensures that * ctx->sk_proto is visible before changing sk->sk_prot in * update_sk_prot(), and prevents reading uninitialized value in * tls_{getsockopt, setsockopt}. Note that we do not need a * read barrier in tls_{getsockopt,setsockopt} as there is an * address dependency between sk->sk_proto->{getsockopt,setsockopt} * and ctx->sk_proto. 
*/ rcu_assign_pointer(icsk->icsk_ulp_data, ctx); return ctx; } static void build_proto_ops(struct proto_ops ops[TLS_NUM_CONFIG][TLS_NUM_CONFIG], const struct proto_ops *base) { ops[TLS_BASE][TLS_BASE] = *base; ops[TLS_SW ][TLS_BASE] = ops[TLS_BASE][TLS_BASE]; ops[TLS_SW ][TLS_BASE].splice_eof = tls_sw_splice_eof; ops[TLS_BASE][TLS_SW ] = ops[TLS_BASE][TLS_BASE]; ops[TLS_BASE][TLS_SW ].splice_read = tls_sw_splice_read; ops[TLS_BASE][TLS_SW ].poll = tls_sk_poll; ops[TLS_BASE][TLS_SW ].read_sock = tls_sw_read_sock; ops[TLS_SW ][TLS_SW ] = ops[TLS_SW ][TLS_BASE]; ops[TLS_SW ][TLS_SW ].splice_read = tls_sw_splice_read; ops[TLS_SW ][TLS_SW ].poll = tls_sk_poll; ops[TLS_SW ][TLS_SW ].read_sock = tls_sw_read_sock; #ifdef CONFIG_TLS_DEVICE ops[TLS_HW ][TLS_BASE] = ops[TLS_BASE][TLS_BASE]; ops[TLS_HW ][TLS_SW ] = ops[TLS_BASE][TLS_SW ]; ops[TLS_BASE][TLS_HW ] = ops[TLS_BASE][TLS_SW ]; ops[TLS_SW ][TLS_HW ] = ops[TLS_SW ][TLS_SW ]; ops[TLS_HW ][TLS_HW ] = ops[TLS_HW ][TLS_SW ]; #endif #ifdef CONFIG_TLS_TOE ops[TLS_HW_RECORD][TLS_HW_RECORD] = *base; #endif } static void tls_build_proto(struct sock *sk) { int ip_ver = sk->sk_family == AF_INET6 ? TLSV6 : TLSV4; struct proto *prot = READ_ONCE(sk->sk_prot); /* Build IPv6 TLS whenever the address of tcpv6 _prot changes */ if (ip_ver == TLSV6 && unlikely(prot != smp_load_acquire(&saved_tcpv6_prot))) { mutex_lock(&tcpv6_prot_mutex); if (likely(prot != saved_tcpv6_prot)) { build_protos(tls_prots[TLSV6], prot); build_proto_ops(tls_proto_ops[TLSV6], sk->sk_socket->ops); smp_store_release(&saved_tcpv6_prot, prot); } mutex_unlock(&tcpv6_prot_mutex); } if (ip_ver == TLSV4 && unlikely(prot != smp_load_acquire(&saved_tcpv4_prot))) { mutex_lock(&tcpv4_prot_mutex); if (likely(prot != saved_tcpv4_prot)) { build_protos(tls_prots[TLSV4], prot); build_proto_ops(tls_proto_ops[TLSV4], sk->sk_socket->ops); smp_store_release(&saved_tcpv4_prot, prot); } mutex_unlock(&tcpv4_prot_mutex); } } static void build_protos(struct proto prot[TLS_NUM_CONFIG][TLS_NUM_CONFIG], const struct proto *base) { prot[TLS_BASE][TLS_BASE] = *base; prot[TLS_BASE][TLS_BASE].setsockopt = tls_setsockopt; prot[TLS_BASE][TLS_BASE].getsockopt = tls_getsockopt; prot[TLS_BASE][TLS_BASE].disconnect = tls_disconnect; prot[TLS_BASE][TLS_BASE].close = tls_sk_proto_close; prot[TLS_SW][TLS_BASE] = prot[TLS_BASE][TLS_BASE]; prot[TLS_SW][TLS_BASE].sendmsg = tls_sw_sendmsg; prot[TLS_SW][TLS_BASE].splice_eof = tls_sw_splice_eof; prot[TLS_BASE][TLS_SW] = prot[TLS_BASE][TLS_BASE]; prot[TLS_BASE][TLS_SW].recvmsg = tls_sw_recvmsg; prot[TLS_BASE][TLS_SW].sock_is_readable = tls_sw_sock_is_readable; prot[TLS_BASE][TLS_SW].close = tls_sk_proto_close; prot[TLS_SW][TLS_SW] = prot[TLS_SW][TLS_BASE]; prot[TLS_SW][TLS_SW].recvmsg = tls_sw_recvmsg; prot[TLS_SW][TLS_SW].sock_is_readable = tls_sw_sock_is_readable; prot[TLS_SW][TLS_SW].close = tls_sk_proto_close; #ifdef CONFIG_TLS_DEVICE prot[TLS_HW][TLS_BASE] = prot[TLS_BASE][TLS_BASE]; prot[TLS_HW][TLS_BASE].sendmsg = tls_device_sendmsg; prot[TLS_HW][TLS_BASE].splice_eof = tls_device_splice_eof; prot[TLS_HW][TLS_SW] = prot[TLS_BASE][TLS_SW]; prot[TLS_HW][TLS_SW].sendmsg = tls_device_sendmsg; prot[TLS_HW][TLS_SW].splice_eof = tls_device_splice_eof; prot[TLS_BASE][TLS_HW] = prot[TLS_BASE][TLS_SW]; prot[TLS_SW][TLS_HW] = prot[TLS_SW][TLS_SW]; prot[TLS_HW][TLS_HW] = prot[TLS_HW][TLS_SW]; #endif #ifdef CONFIG_TLS_TOE prot[TLS_HW_RECORD][TLS_HW_RECORD] = *base; prot[TLS_HW_RECORD][TLS_HW_RECORD].hash = tls_toe_hash; prot[TLS_HW_RECORD][TLS_HW_RECORD].unhash = tls_toe_unhash; #endif } 
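For context, the tx_conf/rx_conf pair chosen in do_tls_setsockopt_conf() is what indexes the proto and proto_ops matrices built above. The following is a hedged userspace sketch of that path, assuming a connected TCP socket and key material taken from an already-completed TLS 1.2 handshake (the kernel never derives keys itself); the SOL_TCP/TCP_ULP/SOL_TLS fallback defines are only for older libc headers.

/* Sketch of the userspace side of the TLS_TX path: attach the "tls"
 * ULP, then push TLS 1.2 AES-GCM-128 key material so
 * do_tls_setsockopt_conf() picks a tx_conf and update_sk_prot() swaps
 * sk->sk_prot.  Assumes fd is a connected TCP socket.
 */
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <linux/tls.h>

#ifndef SOL_TCP
#define SOL_TCP 6	/* fallbacks for older userspace headers */
#endif
#ifndef TCP_ULP
#define TCP_ULP 31
#endif
#ifndef SOL_TLS
#define SOL_TLS 282
#endif

static int enable_ktls_tx(int fd,
			  const unsigned char key[TLS_CIPHER_AES_GCM_128_KEY_SIZE],
			  const unsigned char iv[TLS_CIPHER_AES_GCM_128_IV_SIZE],
			  const unsigned char salt[TLS_CIPHER_AES_GCM_128_SALT_SIZE],
			  const unsigned char rec_seq[TLS_CIPHER_AES_GCM_128_REC_SEQ_SIZE])
{
	struct tls12_crypto_info_aes_gcm_128 ci;

	memset(&ci, 0, sizeof(ci));
	ci.info.version = TLS_1_2_VERSION;
	ci.info.cipher_type = TLS_CIPHER_AES_GCM_128;
	memcpy(ci.key, key, sizeof(ci.key));
	memcpy(ci.iv, iv, sizeof(ci.iv));
	memcpy(ci.salt, salt, sizeof(ci.salt));
	memcpy(ci.rec_seq, rec_seq, sizeof(ci.rec_seq));

	/* tls_init() runs here and requires an ESTABLISHED socket. */
	if (setsockopt(fd, SOL_TCP, TCP_ULP, "tls", sizeof("tls")))
		return -1;

	/* do_tls_setsockopt_conf(sk, ..., tx = 1) consumes this. */
	if (setsockopt(fd, SOL_TLS, TLS_TX, &ci, sizeof(ci)))
		return -1;

	return 0;	/* subsequent send(fd, ...) data is TLS-framed */
}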
static int tls_init(struct sock *sk) { struct tls_context *ctx; int rc = 0; tls_build_proto(sk); #ifdef CONFIG_TLS_TOE if (tls_toe_bypass(sk)) return 0; #endif /* The TLS ulp is currently supported only for TCP sockets * in ESTABLISHED state. * Supporting sockets in LISTEN state will require us * to modify the accept implementation to clone rather then * share the ulp context. */ if (sk->sk_state != TCP_ESTABLISHED) return -ENOTCONN; /* allocate tls context */ write_lock_bh(&sk->sk_callback_lock); ctx = tls_ctx_create(sk); if (!ctx) { rc = -ENOMEM; goto out; } ctx->tx_conf = TLS_BASE; ctx->rx_conf = TLS_BASE; update_sk_prot(sk, ctx); out: write_unlock_bh(&sk->sk_callback_lock); return rc; } static void tls_update(struct sock *sk, struct proto *p, void (*write_space)(struct sock *sk)) { struct tls_context *ctx; WARN_ON_ONCE(sk->sk_prot == p); ctx = tls_get_ctx(sk); if (likely(ctx)) { ctx->sk_write_space = write_space; ctx->sk_proto = p; } else { /* Pairs with lockless read in sk_clone_lock(). */ WRITE_ONCE(sk->sk_prot, p); sk->sk_write_space = write_space; } } static u16 tls_user_config(struct tls_context *ctx, bool tx) { u16 config = tx ? ctx->tx_conf : ctx->rx_conf; switch (config) { case TLS_BASE: return TLS_CONF_BASE; case TLS_SW: return TLS_CONF_SW; case TLS_HW: return TLS_CONF_HW; case TLS_HW_RECORD: return TLS_CONF_HW_RECORD; } return 0; } static int tls_get_info(struct sock *sk, struct sk_buff *skb, bool net_admin) { u16 version, cipher_type; struct tls_context *ctx; struct nlattr *start; int err; start = nla_nest_start_noflag(skb, INET_ULP_INFO_TLS); if (!start) return -EMSGSIZE; rcu_read_lock(); ctx = rcu_dereference(inet_csk(sk)->icsk_ulp_data); if (!ctx) { err = 0; goto nla_failure; } version = ctx->prot_info.version; if (version) { err = nla_put_u16(skb, TLS_INFO_VERSION, version); if (err) goto nla_failure; } cipher_type = ctx->prot_info.cipher_type; if (cipher_type) { err = nla_put_u16(skb, TLS_INFO_CIPHER, cipher_type); if (err) goto nla_failure; } err = nla_put_u16(skb, TLS_INFO_TXCONF, tls_user_config(ctx, true)); if (err) goto nla_failure; err = nla_put_u16(skb, TLS_INFO_RXCONF, tls_user_config(ctx, false)); if (err) goto nla_failure; if (ctx->tx_conf == TLS_HW && ctx->zerocopy_sendfile) { err = nla_put_flag(skb, TLS_INFO_ZC_RO_TX); if (err) goto nla_failure; } if (ctx->rx_no_pad) { err = nla_put_flag(skb, TLS_INFO_RX_NO_PAD); if (err) goto nla_failure; } rcu_read_unlock(); nla_nest_end(skb, start); return 0; nla_failure: rcu_read_unlock(); nla_nest_cancel(skb, start); return err; } static size_t tls_get_info_size(const struct sock *sk, bool net_admin) { size_t size = 0; size += nla_total_size(0) + /* INET_ULP_INFO_TLS */ nla_total_size(sizeof(u16)) + /* TLS_INFO_VERSION */ nla_total_size(sizeof(u16)) + /* TLS_INFO_CIPHER */ nla_total_size(sizeof(u16)) + /* TLS_INFO_RXCONF */ nla_total_size(sizeof(u16)) + /* TLS_INFO_TXCONF */ nla_total_size(0) + /* TLS_INFO_ZC_RO_TX */ nla_total_size(0) + /* TLS_INFO_RX_NO_PAD */ 0; return size; } static int __net_init tls_init_net(struct net *net) { int err; net->mib.tls_statistics = alloc_percpu(struct linux_tls_mib); if (!net->mib.tls_statistics) return -ENOMEM; err = tls_proc_init(net); if (err) goto err_free_stats; return 0; err_free_stats: free_percpu(net->mib.tls_statistics); return err; } static void __net_exit tls_exit_net(struct net *net) { tls_proc_fini(net); free_percpu(net->mib.tls_statistics); } static struct pernet_operations tls_proc_ops = { .init = tls_init_net, .exit = tls_exit_net, }; static struct tcp_ulp_ops 
tcp_tls_ulp_ops __read_mostly = { .name = "tls", .owner = THIS_MODULE, .init = tls_init, .update = tls_update, .get_info = tls_get_info, .get_info_size = tls_get_info_size, }; static int __init tls_register(void) { int err; err = register_pernet_subsys(&tls_proc_ops); if (err) return err; err = tls_strp_dev_init(); if (err) goto err_pernet; err = tls_device_init(); if (err) goto err_strp; tcp_register_ulp(&tcp_tls_ulp_ops); return 0; err_strp: tls_strp_dev_exit(); err_pernet: unregister_pernet_subsys(&tls_proc_ops); return err; } static void __exit tls_unregister(void) { tcp_unregister_ulp(&tcp_tls_ulp_ops); tls_strp_dev_exit(); tls_device_cleanup(); unregister_pernet_subsys(&tls_proc_ops); } module_init(tls_register); module_exit(tls_unregister);
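/*
 * Illustrative userspace sketch (not part of the file above): tls_init() runs
 * when an application attaches the "tls" ULP to an *established* TCP socket,
 * and do_tls_setsockopt_conf() then installs per-direction key material via
 * TLS_TX/TLS_RX. The key/iv/salt/rec_seq buffers are placeholders that would
 * normally come from a userspace TLS handshake; the TCP_ULP fallback value is
 * an assumption taken from the kernel headers.
 */
#if 0 /* userspace illustration only */
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <linux/tls.h>

#ifndef TCP_ULP
#define TCP_ULP 31
#endif
#ifndef SOL_TLS
#define SOL_TLS 282
#endif

static int attach_ktls_tx(int fd, const unsigned char *key,
			  const unsigned char *iv, const unsigned char *salt,
			  const unsigned char *rec_seq)
{
	struct tls12_crypto_info_aes_gcm_128 ci = {
		.info.version = TLS_1_2_VERSION,
		.info.cipher_type = TLS_CIPHER_AES_GCM_128,
	};

	memcpy(ci.key, key, TLS_CIPHER_AES_GCM_128_KEY_SIZE);
	memcpy(ci.iv, iv, TLS_CIPHER_AES_GCM_128_IV_SIZE);
	memcpy(ci.salt, salt, TLS_CIPHER_AES_GCM_128_SALT_SIZE);
	memcpy(ci.rec_seq, rec_seq, TLS_CIPHER_AES_GCM_128_REC_SEQ_SIZE);

	/* tls_init() returns -ENOTCONN unless the socket is ESTABLISHED */
	if (setsockopt(fd, IPPROTO_TCP, TCP_ULP, "tls", sizeof("tls")))
		return -1;

	/* handled by do_tls_setsockopt_conf(..., tx = true) */
	return setsockopt(fd, SOL_TLS, TLS_TX, &ci, sizeof(ci));
}
#endif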
// SPDX-License-Identifier: GPL-2.0-only /* * Minimal file system backend for holding eBPF maps and programs, * used by bpf(2) object pinning. * * Authors: * * Daniel Borkmann <daniel@iogearbox.net> */ #include <linux/init.h> #include <linux/magic.h> #include <linux/major.h> #include <linux/mount.h> #include <linux/namei.h> #include <linux/fs.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> #include <linux/kdev_t.h> #include <linux/filter.h> #include <linux/bpf.h> #include <linux/bpf_trace.h> #include <linux/kstrtox.h> #include "preload/bpf_preload.h" enum bpf_type { BPF_TYPE_UNSPEC = 0, BPF_TYPE_PROG, BPF_TYPE_MAP, BPF_TYPE_LINK, }; static void *bpf_any_get(void *raw, enum bpf_type type) { switch (type) { case BPF_TYPE_PROG: bpf_prog_inc(raw); break; case BPF_TYPE_MAP: bpf_map_inc_with_uref(raw); break; case BPF_TYPE_LINK: bpf_link_inc(raw); break; default: WARN_ON_ONCE(1); break; } return raw; } static void bpf_any_put(void *raw, enum bpf_type type) { switch (type) { case BPF_TYPE_PROG: bpf_prog_put(raw); break; case BPF_TYPE_MAP: bpf_map_put_with_uref(raw); break; case BPF_TYPE_LINK: bpf_link_put(raw); break; default: WARN_ON_ONCE(1); break; } } static void *bpf_fd_probe_obj(u32 ufd, enum bpf_type *type) { void *raw; raw = bpf_map_get_with_uref(ufd); if (!IS_ERR(raw)) { *type = BPF_TYPE_MAP; return raw; } raw = bpf_prog_get(ufd); if (!IS_ERR(raw)) { *type = BPF_TYPE_PROG; return raw; } raw = bpf_link_get_from_fd(ufd); if (!IS_ERR(raw)) { *type = BPF_TYPE_LINK; return raw; } return ERR_PTR(-EINVAL); } static const struct inode_operations bpf_dir_iops; static const struct inode_operations bpf_prog_iops = { }; static const struct inode_operations bpf_map_iops = { }; static const struct inode_operations bpf_link_iops = { }; struct inode *bpf_get_inode(struct super_block *sb, const struct inode *dir, umode_t mode) { struct inode *inode; switch (mode & S_IFMT) { case S_IFDIR: case S_IFREG: case S_IFLNK: break; default: return ERR_PTR(-EINVAL); } inode = new_inode(sb); if (!inode) return ERR_PTR(-ENOSPC); inode->i_ino = get_next_ino(); simple_inode_init_ts(inode); inode_init_owner(&nop_mnt_idmap, inode, dir, mode); return inode; } static int bpf_inode_type(const struct inode *inode, enum bpf_type *type) { *type = BPF_TYPE_UNSPEC; if (inode->i_op == &bpf_prog_iops) *type = BPF_TYPE_PROG; else if (inode->i_op == &bpf_map_iops) *type = BPF_TYPE_MAP; else if (inode->i_op == &bpf_link_iops) *type = BPF_TYPE_LINK; else return -EACCES;
return 0; } static void bpf_dentry_finalize(struct dentry *dentry, struct inode *inode, struct inode *dir) { d_instantiate(dentry, inode); dget(dentry); inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); } static struct dentry *bpf_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { struct inode *inode; inode = bpf_get_inode(dir->i_sb, dir, mode | S_IFDIR); if (IS_ERR(inode)) return ERR_CAST(inode); inode->i_op = &bpf_dir_iops; inode->i_fop = &simple_dir_operations; inc_nlink(inode); inc_nlink(dir); bpf_dentry_finalize(dentry, inode, dir); return NULL; } struct map_iter { void *key; bool done; }; static struct map_iter *map_iter(struct seq_file *m) { return m->private; } static struct bpf_map *seq_file_to_map(struct seq_file *m) { return file_inode(m->file)->i_private; } static void map_iter_free(struct map_iter *iter) { if (iter) { kfree(iter->key); kfree(iter); } } static struct map_iter *map_iter_alloc(struct bpf_map *map) { struct map_iter *iter; iter = kzalloc(sizeof(*iter), GFP_KERNEL | __GFP_NOWARN); if (!iter) goto error; iter->key = kzalloc(map->key_size, GFP_KERNEL | __GFP_NOWARN); if (!iter->key) goto error; return iter; error: map_iter_free(iter); return NULL; } static void *map_seq_next(struct seq_file *m, void *v, loff_t *pos) { struct bpf_map *map = seq_file_to_map(m); void *key = map_iter(m)->key; void *prev_key; (*pos)++; if (map_iter(m)->done) return NULL; if (unlikely(v == SEQ_START_TOKEN)) prev_key = NULL; else prev_key = key; rcu_read_lock(); if (map->ops->map_get_next_key(map, prev_key, key)) { map_iter(m)->done = true; key = NULL; } rcu_read_unlock(); return key; } static void *map_seq_start(struct seq_file *m, loff_t *pos) { if (map_iter(m)->done) return NULL; return *pos ? map_iter(m)->key : SEQ_START_TOKEN; } static void map_seq_stop(struct seq_file *m, void *v) { } static int map_seq_show(struct seq_file *m, void *v) { struct bpf_map *map = seq_file_to_map(m); void *key = map_iter(m)->key; if (unlikely(v == SEQ_START_TOKEN)) { seq_puts(m, "# WARNING!! The output is for debug purpose only\n"); seq_puts(m, "# WARNING!! The output format will change\n"); } else { map->ops->map_seq_show_elem(map, key, m); } return 0; } static const struct seq_operations bpffs_map_seq_ops = { .start = map_seq_start, .next = map_seq_next, .show = map_seq_show, .stop = map_seq_stop, }; static int bpffs_map_open(struct inode *inode, struct file *file) { struct bpf_map *map = inode->i_private; struct map_iter *iter; struct seq_file *m; int err; iter = map_iter_alloc(map); if (!iter) return -ENOMEM; err = seq_open(file, &bpffs_map_seq_ops); if (err) { map_iter_free(iter); return err; } m = file->private_data; m->private = iter; return 0; } static int bpffs_map_release(struct inode *inode, struct file *file) { struct seq_file *m = file->private_data; map_iter_free(map_iter(m)); return seq_release(inode, file); } /* bpffs_map_fops should only implement the basic * read operation for a BPF map. The purpose is to * provide a simple user intuitive way to do * "cat bpffs/pathto/a-pinned-map". * * Other operations (e.g. write, lookup...) should be realized by * the userspace tools (e.g. bpftool) through the * BPF_OBJ_GET_INFO_BY_FD and the map's lookup/update * interface. 
*/ static const struct file_operations bpffs_map_fops = { .open = bpffs_map_open, .read = seq_read, .release = bpffs_map_release, }; static int bpffs_obj_open(struct inode *inode, struct file *file) { return -EIO; } static const struct file_operations bpffs_obj_fops = { .open = bpffs_obj_open, }; static int bpf_mkobj_ops(struct dentry *dentry, umode_t mode, void *raw, const struct inode_operations *iops, const struct file_operations *fops) { struct inode *dir = dentry->d_parent->d_inode; struct inode *inode = bpf_get_inode(dir->i_sb, dir, mode); if (IS_ERR(inode)) return PTR_ERR(inode); inode->i_op = iops; inode->i_fop = fops; inode->i_private = raw; bpf_dentry_finalize(dentry, inode, dir); return 0; } static int bpf_mkprog(struct dentry *dentry, umode_t mode, void *arg) { return bpf_mkobj_ops(dentry, mode, arg, &bpf_prog_iops, &bpffs_obj_fops); } static int bpf_mkmap(struct dentry *dentry, umode_t mode, void *arg) { struct bpf_map *map = arg; return bpf_mkobj_ops(dentry, mode, arg, &bpf_map_iops, bpf_map_support_seq_show(map) ? &bpffs_map_fops : &bpffs_obj_fops); } static int bpf_mklink(struct dentry *dentry, umode_t mode, void *arg) { struct bpf_link *link = arg; return bpf_mkobj_ops(dentry, mode, arg, &bpf_link_iops, bpf_link_is_iter(link) ? &bpf_iter_fops : &bpffs_obj_fops); } static struct dentry * bpf_lookup(struct inode *dir, struct dentry *dentry, unsigned flags) { /* Dots in names (e.g. "/sys/fs/bpf/foo.bar") are reserved for future * extensions. That allows popoulate_bpffs() create special files. */ if ((dir->i_mode & S_IALLUGO) && strchr(dentry->d_name.name, '.')) return ERR_PTR(-EPERM); return simple_lookup(dir, dentry, flags); } static int bpf_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *target) { char *link = kstrdup(target, GFP_USER | __GFP_NOWARN); struct inode *inode; if (!link) return -ENOMEM; inode = bpf_get_inode(dir->i_sb, dir, S_IRWXUGO | S_IFLNK); if (IS_ERR(inode)) { kfree(link); return PTR_ERR(inode); } inode->i_op = &simple_symlink_inode_operations; inode->i_link = link; bpf_dentry_finalize(dentry, inode, dir); return 0; } static const struct inode_operations bpf_dir_iops = { .lookup = bpf_lookup, .mkdir = bpf_mkdir, .symlink = bpf_symlink, .rmdir = simple_rmdir, .rename = simple_rename, .link = simple_link, .unlink = simple_unlink, }; /* pin iterator link into bpffs */ static int bpf_iter_link_pin_kernel(struct dentry *parent, const char *name, struct bpf_link *link) { umode_t mode = S_IFREG | S_IRUSR; struct dentry *dentry; int ret; inode_lock(parent->d_inode); dentry = lookup_noperm(&QSTR(name), parent); if (IS_ERR(dentry)) { inode_unlock(parent->d_inode); return PTR_ERR(dentry); } ret = bpf_mkobj_ops(dentry, mode, link, &bpf_link_iops, &bpf_iter_fops); dput(dentry); inode_unlock(parent->d_inode); return ret; } static int bpf_obj_do_pin(int path_fd, const char __user *pathname, void *raw, enum bpf_type type) { struct dentry *dentry; struct inode *dir; struct path path; umode_t mode; int ret; dentry = start_creating_user_path(path_fd, pathname, &path, 0); if (IS_ERR(dentry)) return PTR_ERR(dentry); dir = d_inode(path.dentry); if (dir->i_op != &bpf_dir_iops) { ret = -EPERM; goto out; } mode = S_IFREG | ((S_IRUSR | S_IWUSR) & ~current_umask()); ret = security_path_mknod(&path, dentry, mode, 0); if (ret) goto out; switch (type) { case BPF_TYPE_PROG: ret = vfs_mkobj(dentry, mode, bpf_mkprog, raw); break; case BPF_TYPE_MAP: ret = vfs_mkobj(dentry, mode, bpf_mkmap, raw); break; case BPF_TYPE_LINK: ret = vfs_mkobj(dentry, mode, 
bpf_mklink, raw); break; default: ret = -EPERM; } out: end_creating_path(&path, dentry); return ret; } int bpf_obj_pin_user(u32 ufd, int path_fd, const char __user *pathname) { enum bpf_type type; void *raw; int ret; raw = bpf_fd_probe_obj(ufd, &type); if (IS_ERR(raw)) return PTR_ERR(raw); ret = bpf_obj_do_pin(path_fd, pathname, raw, type); if (ret != 0) bpf_any_put(raw, type); return ret; } static void *bpf_obj_do_get(int path_fd, const char __user *pathname, enum bpf_type *type, int flags) { struct inode *inode; struct path path; void *raw; int ret; ret = user_path_at(path_fd, pathname, LOOKUP_FOLLOW, &path); if (ret) return ERR_PTR(ret); inode = d_backing_inode(path.dentry); ret = path_permission(&path, ACC_MODE(flags)); if (ret) goto out; ret = bpf_inode_type(inode, type); if (ret) goto out; raw = bpf_any_get(inode->i_private, *type); if (!IS_ERR(raw)) touch_atime(&path); path_put(&path); return raw; out: path_put(&path); return ERR_PTR(ret); } int bpf_obj_get_user(int path_fd, const char __user *pathname, int flags) { enum bpf_type type = BPF_TYPE_UNSPEC; int f_flags; void *raw; int ret; f_flags = bpf_get_file_flag(flags); if (f_flags < 0) return f_flags; raw = bpf_obj_do_get(path_fd, pathname, &type, f_flags); if (IS_ERR(raw)) return PTR_ERR(raw); if (type == BPF_TYPE_PROG) ret = bpf_prog_new_fd(raw); else if (type == BPF_TYPE_MAP) ret = bpf_map_new_fd(raw, f_flags); else if (type == BPF_TYPE_LINK) ret = (f_flags != O_RDWR) ? -EINVAL : bpf_link_new_fd(raw); else return -ENOENT; if (ret < 0) bpf_any_put(raw, type); return ret; } static struct bpf_prog *__get_prog_inode(struct inode *inode, enum bpf_prog_type type) { struct bpf_prog *prog; int ret = inode_permission(&nop_mnt_idmap, inode, MAY_READ); if (ret) return ERR_PTR(ret); if (inode->i_op == &bpf_map_iops) return ERR_PTR(-EINVAL); if (inode->i_op == &bpf_link_iops) return ERR_PTR(-EINVAL); if (inode->i_op != &bpf_prog_iops) return ERR_PTR(-EACCES); prog = inode->i_private; ret = security_bpf_prog(prog); if (ret < 0) return ERR_PTR(ret); if (!bpf_prog_get_ok(prog, &type, false)) return ERR_PTR(-EINVAL); bpf_prog_inc(prog); return prog; } struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type type) { struct bpf_prog *prog; struct path path; int ret = kern_path(name, LOOKUP_FOLLOW, &path); if (ret) return ERR_PTR(ret); prog = __get_prog_inode(d_backing_inode(path.dentry), type); if (!IS_ERR(prog)) touch_atime(&path); path_put(&path); return prog; } EXPORT_SYMBOL(bpf_prog_get_type_path); struct bpffs_btf_enums { const struct btf *btf; const struct btf_type *cmd_t; const struct btf_type *map_t; const struct btf_type *prog_t; const struct btf_type *attach_t; }; static int find_bpffs_btf_enums(struct bpffs_btf_enums *info) { const struct btf *btf; const struct btf_type *t; const char *name; int i, n; memset(info, 0, sizeof(*info)); btf = bpf_get_btf_vmlinux(); if (IS_ERR(btf)) return PTR_ERR(btf); if (!btf) return -ENOENT; info->btf = btf; for (i = 1, n = btf_nr_types(btf); i < n; i++) { t = btf_type_by_id(btf, i); if (!btf_type_is_enum(t)) continue; name = btf_name_by_offset(btf, t->name_off); if (!name) continue; if (strcmp(name, "bpf_cmd") == 0) info->cmd_t = t; else if (strcmp(name, "bpf_map_type") == 0) info->map_t = t; else if (strcmp(name, "bpf_prog_type") == 0) info->prog_t = t; else if (strcmp(name, "bpf_attach_type") == 0) info->attach_t = t; else continue; if (info->cmd_t && info->map_t && info->prog_t && info->attach_t) return 0; } return -ESRCH; } static bool find_btf_enum_const(const struct btf *btf, 
const struct btf_type *enum_t, const char *prefix, const char *str, int *value) { const struct btf_enum *e; const char *name; int i, n, pfx_len = strlen(prefix); *value = 0; if (!btf || !enum_t) return false; for (i = 0, n = btf_vlen(enum_t); i < n; i++) { e = &btf_enum(enum_t)[i]; name = btf_name_by_offset(btf, e->name_off); if (!name || strncasecmp(name, prefix, pfx_len) != 0) continue; /* match symbolic name case insensitive and ignoring prefix */ if (strcasecmp(name + pfx_len, str) == 0) { *value = e->val; return true; } } return false; } static void seq_print_delegate_opts(struct seq_file *m, const char *opt_name, const struct btf *btf, const struct btf_type *enum_t, const char *prefix, u64 delegate_msk, u64 any_msk) { const struct btf_enum *e; bool first = true; const char *name; u64 msk; int i, n, pfx_len = strlen(prefix); delegate_msk &= any_msk; /* clear unknown bits */ if (delegate_msk == 0) return; seq_printf(m, ",%s", opt_name); if (delegate_msk == any_msk) { seq_printf(m, "=any"); return; } if (btf && enum_t) { for (i = 0, n = btf_vlen(enum_t); i < n; i++) { e = &btf_enum(enum_t)[i]; name = btf_name_by_offset(btf, e->name_off); if (!name || strncasecmp(name, prefix, pfx_len) != 0) continue; msk = 1ULL << e->val; if (delegate_msk & msk) { /* emit lower-case name without prefix */ seq_putc(m, first ? '=' : ':'); name += pfx_len; while (*name) { seq_putc(m, tolower(*name)); name++; } delegate_msk &= ~msk; first = false; } } } if (delegate_msk) seq_printf(m, "%c0x%llx", first ? '=' : ':', delegate_msk); } /* * Display the mount options in /proc/mounts. */ static int bpf_show_options(struct seq_file *m, struct dentry *root) { struct inode *inode = d_inode(root); umode_t mode = inode->i_mode & S_IALLUGO & ~S_ISVTX; struct bpf_mount_opts *opts = root->d_sb->s_fs_info; u64 mask; if (!uid_eq(inode->i_uid, GLOBAL_ROOT_UID)) seq_printf(m, ",uid=%u", from_kuid_munged(&init_user_ns, inode->i_uid)); if (!gid_eq(inode->i_gid, GLOBAL_ROOT_GID)) seq_printf(m, ",gid=%u", from_kgid_munged(&init_user_ns, inode->i_gid)); if (mode != S_IRWXUGO) seq_printf(m, ",mode=%o", mode); if (opts->delegate_cmds || opts->delegate_maps || opts->delegate_progs || opts->delegate_attachs) { struct bpffs_btf_enums info; /* ignore errors, fallback to hex */ (void)find_bpffs_btf_enums(&info); mask = (1ULL << __MAX_BPF_CMD) - 1; seq_print_delegate_opts(m, "delegate_cmds", info.btf, info.cmd_t, "BPF_", opts->delegate_cmds, mask); mask = (1ULL << __MAX_BPF_MAP_TYPE) - 1; seq_print_delegate_opts(m, "delegate_maps", info.btf, info.map_t, "BPF_MAP_TYPE_", opts->delegate_maps, mask); mask = (1ULL << __MAX_BPF_PROG_TYPE) - 1; seq_print_delegate_opts(m, "delegate_progs", info.btf, info.prog_t, "BPF_PROG_TYPE_", opts->delegate_progs, mask); mask = (1ULL << __MAX_BPF_ATTACH_TYPE) - 1; seq_print_delegate_opts(m, "delegate_attachs", info.btf, info.attach_t, "BPF_", opts->delegate_attachs, mask); } return 0; } static void bpf_free_inode(struct inode *inode) { enum bpf_type type; if (S_ISLNK(inode->i_mode)) kfree(inode->i_link); if (!bpf_inode_type(inode, &type)) bpf_any_put(inode->i_private, type); free_inode_nonrcu(inode); } const struct super_operations bpf_super_ops = { .statfs = simple_statfs, .drop_inode = inode_just_drop, .show_options = bpf_show_options, .free_inode = bpf_free_inode, }; enum { OPT_UID, OPT_GID, OPT_MODE, OPT_DELEGATE_CMDS, OPT_DELEGATE_MAPS, OPT_DELEGATE_PROGS, OPT_DELEGATE_ATTACHS, }; static const struct fs_parameter_spec bpf_fs_parameters[] = { fsparam_u32 ("uid", OPT_UID), fsparam_u32 ("gid", OPT_GID), 
fsparam_u32oct ("mode", OPT_MODE), fsparam_string ("delegate_cmds", OPT_DELEGATE_CMDS), fsparam_string ("delegate_maps", OPT_DELEGATE_MAPS), fsparam_string ("delegate_progs", OPT_DELEGATE_PROGS), fsparam_string ("delegate_attachs", OPT_DELEGATE_ATTACHS), {} }; static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct bpf_mount_opts *opts = fc->s_fs_info; struct fs_parse_result result; kuid_t uid; kgid_t gid; int opt, err; opt = fs_parse(fc, bpf_fs_parameters, param, &result); if (opt < 0) { /* We might like to report bad mount options here, but * traditionally we've ignored all mount options, so we'd * better continue to ignore non-existing options for bpf. */ if (opt == -ENOPARAM) { opt = vfs_parse_fs_param_source(fc, param); if (opt != -ENOPARAM) return opt; return 0; } if (opt < 0) return opt; } switch (opt) { case OPT_UID: uid = make_kuid(current_user_ns(), result.uint_32); if (!uid_valid(uid)) goto bad_value; /* * The requested uid must be representable in the * filesystem's idmapping. */ if (!kuid_has_mapping(fc->user_ns, uid)) goto bad_value; opts->uid = uid; break; case OPT_GID: gid = make_kgid(current_user_ns(), result.uint_32); if (!gid_valid(gid)) goto bad_value; /* * The requested gid must be representable in the * filesystem's idmapping. */ if (!kgid_has_mapping(fc->user_ns, gid)) goto bad_value; opts->gid = gid; break; case OPT_MODE: opts->mode = result.uint_32 & S_IALLUGO; break; case OPT_DELEGATE_CMDS: case OPT_DELEGATE_MAPS: case OPT_DELEGATE_PROGS: case OPT_DELEGATE_ATTACHS: { struct bpffs_btf_enums info; const struct btf_type *enum_t; const char *enum_pfx; u64 *delegate_msk, msk = 0; char *p, *str; int val; /* ignore errors, fallback to hex */ (void)find_bpffs_btf_enums(&info); switch (opt) { case OPT_DELEGATE_CMDS: delegate_msk = &opts->delegate_cmds; enum_t = info.cmd_t; enum_pfx = "BPF_"; break; case OPT_DELEGATE_MAPS: delegate_msk = &opts->delegate_maps; enum_t = info.map_t; enum_pfx = "BPF_MAP_TYPE_"; break; case OPT_DELEGATE_PROGS: delegate_msk = &opts->delegate_progs; enum_t = info.prog_t; enum_pfx = "BPF_PROG_TYPE_"; break; case OPT_DELEGATE_ATTACHS: delegate_msk = &opts->delegate_attachs; enum_t = info.attach_t; enum_pfx = "BPF_"; break; default: return -EINVAL; } str = param->string; while ((p = strsep(&str, ":"))) { if (strcmp(p, "any") == 0) { msk |= ~0ULL; } else if (find_btf_enum_const(info.btf, enum_t, enum_pfx, p, &val)) { msk |= 1ULL << val; } else { err = kstrtou64(p, 0, &msk); if (err) return err; } } /* Setting delegation mount options requires privileges */ if (msk && !capable(CAP_SYS_ADMIN)) return -EPERM; *delegate_msk |= msk; break; } default: /* ignore unknown mount options */ break; } return 0; bad_value: return invalfc(fc, "Bad value for '%s'", param->key); } struct bpf_preload_ops *bpf_preload_ops; EXPORT_SYMBOL_GPL(bpf_preload_ops); static bool bpf_preload_mod_get(void) { /* If bpf_preload.ko wasn't loaded earlier then load it now. * When bpf_preload is built into vmlinux the module's __init * function will populate it. */ if (!bpf_preload_ops) { request_module("bpf_preload"); if (!bpf_preload_ops) return false; } /* And grab the reference, so the module doesn't disappear while the * kernel is interacting with the kernel module and its UMD. 
*/ if (!try_module_get(bpf_preload_ops->owner)) { pr_err("bpf_preload module get failed.\n"); return false; } return true; } static void bpf_preload_mod_put(void) { if (bpf_preload_ops) /* now user can "rmmod bpf_preload" if necessary */ module_put(bpf_preload_ops->owner); } static DEFINE_MUTEX(bpf_preload_lock); static int populate_bpffs(struct dentry *parent) { struct bpf_preload_info objs[BPF_PRELOAD_LINKS] = {}; int err = 0, i; /* grab the mutex to make sure the kernel interactions with bpf_preload * are serialized */ mutex_lock(&bpf_preload_lock); /* if bpf_preload.ko wasn't built into vmlinux then load it */ if (!bpf_preload_mod_get()) goto out; err = bpf_preload_ops->preload(objs); if (err) goto out_put; for (i = 0; i < BPF_PRELOAD_LINKS; i++) { bpf_link_inc(objs[i].link); err = bpf_iter_link_pin_kernel(parent, objs[i].link_name, objs[i].link); if (err) { bpf_link_put(objs[i].link); goto out_put; } } out_put: bpf_preload_mod_put(); out: mutex_unlock(&bpf_preload_lock); return err; } static int bpf_fill_super(struct super_block *sb, struct fs_context *fc) { static const struct tree_descr bpf_rfiles[] = { { "" } }; struct bpf_mount_opts *opts = sb->s_fs_info; struct inode *inode; int ret; /* Mounting an instance of BPF FS requires privileges */ if (fc->user_ns != &init_user_ns && !capable(CAP_SYS_ADMIN)) return -EPERM; ret = simple_fill_super(sb, BPF_FS_MAGIC, bpf_rfiles); if (ret) return ret; sb->s_op = &bpf_super_ops; inode = sb->s_root->d_inode; inode->i_uid = opts->uid; inode->i_gid = opts->gid; inode->i_op = &bpf_dir_iops; inode->i_mode &= ~S_IALLUGO; populate_bpffs(sb->s_root); inode->i_mode |= S_ISVTX | opts->mode; return 0; } static int bpf_get_tree(struct fs_context *fc) { return get_tree_nodev(fc, bpf_fill_super); } static void bpf_free_fc(struct fs_context *fc) { kfree(fc->s_fs_info); } static const struct fs_context_operations bpf_context_ops = { .free = bpf_free_fc, .parse_param = bpf_parse_param, .get_tree = bpf_get_tree, }; /* * Set up the filesystem mount context. */ static int bpf_init_fs_context(struct fs_context *fc) { struct bpf_mount_opts *opts; opts = kzalloc(sizeof(struct bpf_mount_opts), GFP_KERNEL); if (!opts) return -ENOMEM; opts->mode = S_IRWXUGO; opts->uid = current_fsuid(); opts->gid = current_fsgid(); /* start out with no BPF token delegation enabled */ opts->delegate_cmds = 0; opts->delegate_maps = 0; opts->delegate_progs = 0; opts->delegate_attachs = 0; fc->s_fs_info = opts; fc->ops = &bpf_context_ops; return 0; } static void bpf_kill_super(struct super_block *sb) { struct bpf_mount_opts *opts = sb->s_fs_info; kill_litter_super(sb); kfree(opts); } static struct file_system_type bpf_fs_type = { .owner = THIS_MODULE, .name = "bpf", .init_fs_context = bpf_init_fs_context, .parameters = bpf_fs_parameters, .kill_sb = bpf_kill_super, .fs_flags = FS_USERNS_MOUNT, }; static int __init bpf_init(void) { int ret; ret = sysfs_create_mount_point(fs_kobj, "bpf"); if (ret) return ret; ret = register_filesystem(&bpf_fs_type); if (ret) sysfs_remove_mount_point(fs_kobj, "bpf"); return ret; } fs_initcall(bpf_init);
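/*
 * Illustrative userspace sketch (not part of the file above): the pin/get
 * paths implemented by bpf_obj_pin_user() and bpf_obj_get_user() are driven
 * by the bpf(2) syscall's BPF_OBJ_PIN and BPF_OBJ_GET commands. The
 * "/sys/fs/bpf/my_prog" path in the usage comment is just an example path
 * under a mounted bpffs instance.
 */
#if 0 /* userspace illustration only */
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

static int bpffs_pin_fd(int fd, const char *path)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.pathname = (__u64)(unsigned long)path;
	attr.bpf_fd = fd;
	return syscall(__NR_bpf, BPF_OBJ_PIN, &attr, sizeof(attr));
}

static int bpffs_get_fd(const char *path)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.pathname = (__u64)(unsigned long)path;
	/* file_flags may be 0 or BPF_F_RDONLY/BPF_F_WRONLY, cf. bpf_get_file_flag() */
	return syscall(__NR_bpf, BPF_OBJ_GET, &attr, sizeof(attr));
}

/* e.g. bpffs_pin_fd(prog_fd, "/sys/fs/bpf/my_prog"); */
#endif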
// SPDX-License-Identifier: GPL-2.0 /* * Management Component Transport Protocol (MCTP) - routing * implementation. * * This is currently based on a simple routing table, with no dst cache. The * number of routes should stay fairly small, so the lookup cost is small. * * Copyright (c) 2021 Code Construct * Copyright (c) 2021 Google */ #include <linux/idr.h> #include <linux/mctp.h> #include <linux/netdevice.h> #include <linux/rtnetlink.h> #include <linux/skbuff.h> #include <net/mctp.h> #include <net/mctpdevice.h> #include <net/netlink.h> #include <net/sock.h> static int mctp_neigh_add(struct mctp_dev *mdev, mctp_eid_t eid, enum mctp_neigh_source source, size_t lladdr_len, const void *lladdr) { struct net *net = dev_net(mdev->dev); struct mctp_neigh *neigh; int rc; mutex_lock(&net->mctp.neigh_lock); if (mctp_neigh_lookup(mdev, eid, NULL) == 0) { rc = -EEXIST; goto out; } if (lladdr_len > sizeof(neigh->ha)) { rc = -EINVAL; goto out; } neigh = kzalloc(sizeof(*neigh), GFP_KERNEL); if (!neigh) { rc = -ENOMEM; goto out; } INIT_LIST_HEAD(&neigh->list); neigh->dev = mdev; mctp_dev_hold(neigh->dev); neigh->eid = eid; neigh->source = source; memcpy(neigh->ha, lladdr, lladdr_len); list_add_rcu(&neigh->list, &net->mctp.neighbours); rc = 0; out: mutex_unlock(&net->mctp.neigh_lock); return rc; } static void __mctp_neigh_free(struct rcu_head *rcu) { struct mctp_neigh *neigh = container_of(rcu, struct mctp_neigh, rcu); mctp_dev_put(neigh->dev); kfree(neigh); } /* Removes all neighbour entries referring to a device */ void mctp_neigh_remove_dev(struct mctp_dev *mdev) { struct net *net = dev_net(mdev->dev); struct mctp_neigh *neigh, *tmp; mutex_lock(&net->mctp.neigh_lock); list_for_each_entry_safe(neigh, tmp, &net->mctp.neighbours, list) { if (neigh->dev == mdev) { list_del_rcu(&neigh->list); /* TODO: immediate RTM_DELNEIGH */ call_rcu(&neigh->rcu, __mctp_neigh_free); } } mutex_unlock(&net->mctp.neigh_lock); } static int mctp_neigh_remove(struct mctp_dev *mdev, mctp_eid_t eid, enum mctp_neigh_source source) { struct net *net = dev_net(mdev->dev); struct mctp_neigh *neigh, *tmp; bool dropped = false; mutex_lock(&net->mctp.neigh_lock); list_for_each_entry_safe(neigh, tmp,
&net->mctp.neighbours, list) { if (neigh->dev == mdev && neigh->eid == eid && neigh->source == source) { list_del_rcu(&neigh->list); /* TODO: immediate RTM_DELNEIGH */ call_rcu(&neigh->rcu, __mctp_neigh_free); dropped = true; } } mutex_unlock(&net->mctp.neigh_lock); return dropped ? 0 : -ENOENT; } static const struct nla_policy nd_mctp_policy[NDA_MAX + 1] = { [NDA_DST] = { .type = NLA_U8 }, [NDA_LLADDR] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, }; static int mctp_rtm_newneigh(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct net_device *dev; struct mctp_dev *mdev; struct ndmsg *ndm; struct nlattr *tb[NDA_MAX + 1]; int rc; mctp_eid_t eid; void *lladdr; int lladdr_len; rc = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, nd_mctp_policy, extack); if (rc < 0) { NL_SET_ERR_MSG(extack, "lladdr too large?"); return rc; } if (!tb[NDA_DST]) { NL_SET_ERR_MSG(extack, "Neighbour EID must be specified"); return -EINVAL; } if (!tb[NDA_LLADDR]) { NL_SET_ERR_MSG(extack, "Neighbour lladdr must be specified"); return -EINVAL; } eid = nla_get_u8(tb[NDA_DST]); if (!mctp_address_unicast(eid)) { NL_SET_ERR_MSG(extack, "Invalid neighbour EID"); return -EINVAL; } lladdr = nla_data(tb[NDA_LLADDR]); lladdr_len = nla_len(tb[NDA_LLADDR]); ndm = nlmsg_data(nlh); dev = __dev_get_by_index(net, ndm->ndm_ifindex); if (!dev) return -ENODEV; mdev = mctp_dev_get_rtnl(dev); if (!mdev) return -ENODEV; if (lladdr_len != dev->addr_len) { NL_SET_ERR_MSG(extack, "Wrong lladdr length"); return -EINVAL; } return mctp_neigh_add(mdev, eid, MCTP_NEIGH_STATIC, lladdr_len, lladdr); } static int mctp_rtm_delneigh(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct nlattr *tb[NDA_MAX + 1]; struct net_device *dev; struct mctp_dev *mdev; struct ndmsg *ndm; int rc; mctp_eid_t eid; rc = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, nd_mctp_policy, extack); if (rc < 0) { NL_SET_ERR_MSG(extack, "incorrect format"); return rc; } if (!tb[NDA_DST]) { NL_SET_ERR_MSG(extack, "Neighbour EID must be specified"); return -EINVAL; } eid = nla_get_u8(tb[NDA_DST]); ndm = nlmsg_data(nlh); dev = __dev_get_by_index(net, ndm->ndm_ifindex); if (!dev) return -ENODEV; mdev = mctp_dev_get_rtnl(dev); if (!mdev) return -ENODEV; return mctp_neigh_remove(mdev, eid, MCTP_NEIGH_STATIC); } static int mctp_fill_neigh(struct sk_buff *skb, u32 portid, u32 seq, int event, unsigned int flags, struct mctp_neigh *neigh) { struct net_device *dev = neigh->dev->dev; struct nlmsghdr *nlh; struct ndmsg *hdr; nlh = nlmsg_put(skb, portid, seq, event, sizeof(*hdr), flags); if (!nlh) return -EMSGSIZE; hdr = nlmsg_data(nlh); hdr->ndm_family = AF_MCTP; hdr->ndm_ifindex = dev->ifindex; hdr->ndm_state = 0; // TODO other state bits? if (neigh->source == MCTP_NEIGH_STATIC) hdr->ndm_state |= NUD_PERMANENT; hdr->ndm_flags = 0; hdr->ndm_type = RTN_UNICAST; // TODO: is loopback RTN_LOCAL? 
if (nla_put_u8(skb, NDA_DST, neigh->eid)) goto cancel; if (nla_put(skb, NDA_LLADDR, dev->addr_len, neigh->ha)) goto cancel; nlmsg_end(skb, nlh); return 0; cancel: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static int mctp_rtm_getneigh(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); int rc, idx, req_ifindex; struct mctp_neigh *neigh; struct ndmsg *ndmsg; struct { int idx; } *cbctx = (void *)cb->ctx; ndmsg = nlmsg_payload(cb->nlh, sizeof(*ndmsg)); if (!ndmsg) return -EINVAL; req_ifindex = ndmsg->ndm_ifindex; idx = 0; rcu_read_lock(); list_for_each_entry_rcu(neigh, &net->mctp.neighbours, list) { if (idx < cbctx->idx) goto cont; rc = 0; if (req_ifindex == 0 || req_ifindex == neigh->dev->dev->ifindex) rc = mctp_fill_neigh(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, RTM_NEWNEIGH, NLM_F_MULTI, neigh); if (rc) break; cont: idx++; } rcu_read_unlock(); cbctx->idx = idx; return skb->len; } int mctp_neigh_lookup(struct mctp_dev *mdev, mctp_eid_t eid, void *ret_hwaddr) { struct net *net = dev_net(mdev->dev); struct mctp_neigh *neigh; int rc = -EHOSTUNREACH; // TODO: or ENOENT? rcu_read_lock(); list_for_each_entry_rcu(neigh, &net->mctp.neighbours, list) { if (mdev == neigh->dev && eid == neigh->eid) { if (ret_hwaddr) memcpy(ret_hwaddr, neigh->ha, sizeof(neigh->ha)); rc = 0; break; } } rcu_read_unlock(); return rc; } /* namespace registration */ static int __net_init mctp_neigh_net_init(struct net *net) { struct netns_mctp *ns = &net->mctp; INIT_LIST_HEAD(&ns->neighbours); mutex_init(&ns->neigh_lock); return 0; } static void __net_exit mctp_neigh_net_exit(struct net *net) { struct netns_mctp *ns = &net->mctp; struct mctp_neigh *neigh; list_for_each_entry(neigh, &ns->neighbours, list) call_rcu(&neigh->rcu, __mctp_neigh_free); } /* net namespace implementation */ static struct pernet_operations mctp_net_ops = { .init = mctp_neigh_net_init, .exit = mctp_neigh_net_exit, }; static const struct rtnl_msg_handler mctp_neigh_rtnl_msg_handlers[] = { {THIS_MODULE, PF_MCTP, RTM_NEWNEIGH, mctp_rtm_newneigh, NULL, 0}, {THIS_MODULE, PF_MCTP, RTM_DELNEIGH, mctp_rtm_delneigh, NULL, 0}, {THIS_MODULE, PF_MCTP, RTM_GETNEIGH, NULL, mctp_rtm_getneigh, 0}, }; int __init mctp_neigh_init(void) { int err; err = register_pernet_subsys(&mctp_net_ops); if (err) return err; err = rtnl_register_many(mctp_neigh_rtnl_msg_handlers); if (err) unregister_pernet_subsys(&mctp_net_ops); return err; } void mctp_neigh_exit(void) { rtnl_unregister_many(mctp_neigh_rtnl_msg_handlers); unregister_pernet_subsys(&mctp_net_ops); }
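/*
 * Illustrative userspace sketch (not part of the file above): a static MCTP
 * neighbour is added through the rtnetlink RTM_NEWNEIGH handler above, with
 * NDA_DST carrying the one-byte EID and NDA_LLADDR carrying a hardware
 * address whose length must equal dev->addr_len (mctp_rtm_newneigh() checks
 * both). The add_rta() helper and the AF_MCTP fallback define are assumptions
 * for the sketch; no ACK handling is shown.
 */
#if 0 /* userspace illustration only */
#include <errno.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <linux/neighbour.h>

#ifndef AF_MCTP
#define AF_MCTP 45
#endif

/* append one rtattr to the end of a netlink request */
static void add_rta(struct nlmsghdr *nlh, unsigned short type,
		    const void *data, unsigned short len)
{
	struct rtattr *rta = (struct rtattr *)((char *)nlh + NLMSG_ALIGN(nlh->nlmsg_len));

	rta->rta_type = type;
	rta->rta_len = RTA_LENGTH(len);
	memcpy(RTA_DATA(rta), data, len);
	nlh->nlmsg_len = NLMSG_ALIGN(nlh->nlmsg_len) + RTA_ALIGN(rta->rta_len);
}

static int mctp_add_static_neigh(int ifindex, unsigned char eid,
				 const unsigned char *lladdr,
				 unsigned short lladdr_len)
{
	struct {
		struct nlmsghdr nlh;
		struct ndmsg ndm;
		char attrbuf[64];
	} req = {};
	struct sockaddr_nl sa = { .nl_family = AF_NETLINK };
	int fd, rc;

	req.nlh.nlmsg_len = NLMSG_LENGTH(sizeof(req.ndm));
	req.nlh.nlmsg_type = RTM_NEWNEIGH;
	req.nlh.nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL | NLM_F_ACK;
	req.ndm.ndm_family = AF_MCTP;
	req.ndm.ndm_ifindex = ifindex;

	add_rta(&req.nlh, NDA_DST, &eid, sizeof(eid));
	add_rta(&req.nlh, NDA_LLADDR, lladdr, lladdr_len);

	fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
	if (fd < 0)
		return -errno;
	rc = sendto(fd, &req, req.nlh.nlmsg_len, 0,
		    (struct sockaddr *)&sa, sizeof(sa));
	close(fd);
	return rc < 0 ? -errno : 0;
}
#endif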
/* * Copyright (c) 2004 Topspin Communications. All rights reserved. * Copyright (c) 2005 Voltaire, Inc. All rights reserved. * Copyright (c) 2006 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE.
*/ #include <linux/init.h> #include <linux/err.h> #include <linux/random.h> #include <linux/spinlock.h> #include <linux/slab.h> #include <linux/dma-mapping.h> #include <linux/kref.h> #include <linux/xarray.h> #include <linux/workqueue.h> #include <uapi/linux/if_ether.h> #include <rdma/ib_pack.h> #include <rdma/ib_cache.h> #include <rdma/rdma_netlink.h> #include <net/netlink.h> #include <uapi/rdma/ib_user_sa.h> #include <rdma/ib_marshall.h> #include <rdma/ib_addr.h> #include <rdma/opa_addr.h> #include <rdma/rdma_cm.h> #include "sa.h" #include "core_priv.h" #define IB_SA_LOCAL_SVC_TIMEOUT_MIN 100 #define IB_SA_LOCAL_SVC_TIMEOUT_DEFAULT 2000 #define IB_SA_LOCAL_SVC_TIMEOUT_MAX 200000 #define IB_SA_CPI_MAX_RETRY_CNT 3 #define IB_SA_CPI_RETRY_WAIT 1000 /*msecs */ static int sa_local_svc_timeout_ms = IB_SA_LOCAL_SVC_TIMEOUT_DEFAULT; struct ib_sa_sm_ah { struct ib_ah *ah; struct kref ref; u16 pkey_index; u8 src_path_mask; }; enum rdma_class_port_info_type { RDMA_CLASS_PORT_INFO_IB, RDMA_CLASS_PORT_INFO_OPA }; struct rdma_class_port_info { enum rdma_class_port_info_type type; union { struct ib_class_port_info ib; struct opa_class_port_info opa; }; }; struct ib_sa_classport_cache { bool valid; int retry_cnt; struct rdma_class_port_info data; }; struct ib_sa_port { struct ib_mad_agent *agent; struct ib_sa_sm_ah *sm_ah; struct work_struct update_task; struct ib_sa_classport_cache classport_info; struct delayed_work ib_cpi_work; spinlock_t classport_lock; /* protects class port info set */ spinlock_t ah_lock; u32 port_num; }; struct ib_sa_device { int start_port, end_port; struct ib_event_handler event_handler; struct ib_sa_port port[]; }; struct ib_sa_query { void (*callback)(struct ib_sa_query *sa_query, int status, struct ib_sa_mad *mad); void (*rmpp_callback)(struct ib_sa_query *sa_query, int status, struct ib_mad_recv_wc *mad); void (*release)(struct ib_sa_query *); struct ib_sa_client *client; struct ib_sa_port *port; struct ib_mad_send_buf *mad_buf; struct ib_sa_sm_ah *sm_ah; int id; u32 flags; struct list_head list; /* Local svc request list */ u32 seq; /* Local svc request sequence number */ unsigned long timeout; /* Local svc timeout */ u8 path_use; /* How will the pathrecord be used */ }; #define IB_SA_ENABLE_LOCAL_SERVICE 0x00000001 #define IB_SA_CANCEL 0x00000002 #define IB_SA_QUERY_OPA 0x00000004 struct ib_sa_path_query { void (*callback)(int status, struct sa_path_rec *rec, unsigned int num_paths, void *context); void *context; struct ib_sa_query sa_query; struct sa_path_rec *conv_pr; }; struct ib_sa_guidinfo_query { void (*callback)(int, struct ib_sa_guidinfo_rec *, void *); void *context; struct ib_sa_query sa_query; }; struct ib_sa_classport_info_query { void (*callback)(void *); void *context; struct ib_sa_query sa_query; }; struct ib_sa_mcmember_query { void (*callback)(int, struct ib_sa_mcmember_rec *, void *); void *context; struct ib_sa_query sa_query; }; struct ib_sa_service_query { void (*callback)(int status, struct sa_service_rec *rec, unsigned int num_services, void *context); void *context; struct ib_sa_query sa_query; }; static LIST_HEAD(ib_nl_request_list); static DEFINE_SPINLOCK(ib_nl_request_lock); static atomic_t ib_nl_sa_request_seq; static struct workqueue_struct *ib_nl_wq; static struct delayed_work ib_nl_timed_work; static const struct nla_policy ib_nl_policy[LS_NLA_TYPE_MAX] = { [LS_NLA_TYPE_PATH_RECORD] = {.type = NLA_BINARY, .len = sizeof(struct ib_path_rec_data)}, [LS_NLA_TYPE_TIMEOUT] = {.type = NLA_U32}, [LS_NLA_TYPE_SERVICE_ID] = {.type = NLA_U64}, 
[LS_NLA_TYPE_DGID] = {.type = NLA_BINARY, .len = sizeof(struct rdma_nla_ls_gid)}, [LS_NLA_TYPE_SGID] = {.type = NLA_BINARY, .len = sizeof(struct rdma_nla_ls_gid)}, [LS_NLA_TYPE_TCLASS] = {.type = NLA_U8}, [LS_NLA_TYPE_PKEY] = {.type = NLA_U16}, [LS_NLA_TYPE_QOS_CLASS] = {.type = NLA_U16}, }; static int ib_sa_add_one(struct ib_device *device); static void ib_sa_remove_one(struct ib_device *device, void *client_data); static struct ib_client sa_client = { .name = "sa", .add = ib_sa_add_one, .remove = ib_sa_remove_one }; static DEFINE_XARRAY_FLAGS(queries, XA_FLAGS_ALLOC | XA_FLAGS_LOCK_IRQ); static DEFINE_SPINLOCK(tid_lock); static u32 tid; #define PATH_REC_FIELD(field) \ .struct_offset_bytes = offsetof(struct sa_path_rec, field), \ .struct_size_bytes = sizeof_field(struct sa_path_rec, field), \ .field_name = "sa_path_rec:" #field static const struct ib_field path_rec_table[] = { { PATH_REC_FIELD(service_id), .offset_words = 0, .offset_bits = 0, .size_bits = 64 }, { PATH_REC_FIELD(dgid), .offset_words = 2, .offset_bits = 0, .size_bits = 128 }, { PATH_REC_FIELD(sgid), .offset_words = 6, .offset_bits = 0, .size_bits = 128 }, { PATH_REC_FIELD(ib.dlid), .offset_words = 10, .offset_bits = 0, .size_bits = 16 }, { PATH_REC_FIELD(ib.slid), .offset_words = 10, .offset_bits = 16, .size_bits = 16 }, { PATH_REC_FIELD(ib.raw_traffic), .offset_words = 11, .offset_bits = 0, .size_bits = 1 }, { RESERVED, .offset_words = 11, .offset_bits = 1, .size_bits = 3 }, { PATH_REC_FIELD(flow_label), .offset_words = 11, .offset_bits = 4, .size_bits = 20 }, { PATH_REC_FIELD(hop_limit), .offset_words = 11, .offset_bits = 24, .size_bits = 8 }, { PATH_REC_FIELD(traffic_class), .offset_words = 12, .offset_bits = 0, .size_bits = 8 }, { PATH_REC_FIELD(reversible), .offset_words = 12, .offset_bits = 8, .size_bits = 1 }, { PATH_REC_FIELD(numb_path), .offset_words = 12, .offset_bits = 9, .size_bits = 7 }, { PATH_REC_FIELD(pkey), .offset_words = 12, .offset_bits = 16, .size_bits = 16 }, { PATH_REC_FIELD(qos_class), .offset_words = 13, .offset_bits = 0, .size_bits = 12 }, { PATH_REC_FIELD(sl), .offset_words = 13, .offset_bits = 12, .size_bits = 4 }, { PATH_REC_FIELD(mtu_selector), .offset_words = 13, .offset_bits = 16, .size_bits = 2 }, { PATH_REC_FIELD(mtu), .offset_words = 13, .offset_bits = 18, .size_bits = 6 }, { PATH_REC_FIELD(rate_selector), .offset_words = 13, .offset_bits = 24, .size_bits = 2 }, { PATH_REC_FIELD(rate), .offset_words = 13, .offset_bits = 26, .size_bits = 6 }, { PATH_REC_FIELD(packet_life_time_selector), .offset_words = 14, .offset_bits = 0, .size_bits = 2 }, { PATH_REC_FIELD(packet_life_time), .offset_words = 14, .offset_bits = 2, .size_bits = 6 }, { PATH_REC_FIELD(preference), .offset_words = 14, .offset_bits = 8, .size_bits = 8 }, { RESERVED, .offset_words = 14, .offset_bits = 16, .size_bits = 48 }, }; #define OPA_PATH_REC_FIELD(field) \ .struct_offset_bytes = \ offsetof(struct sa_path_rec, field), \ .struct_size_bytes = \ sizeof_field(struct sa_path_rec, field), \ .field_name = "sa_path_rec:" #field static const struct ib_field opa_path_rec_table[] = { { OPA_PATH_REC_FIELD(service_id), .offset_words = 0, .offset_bits = 0, .size_bits = 64 }, { OPA_PATH_REC_FIELD(dgid), .offset_words = 2, .offset_bits = 0, .size_bits = 128 }, { OPA_PATH_REC_FIELD(sgid), .offset_words = 6, .offset_bits = 0, .size_bits = 128 }, { OPA_PATH_REC_FIELD(opa.dlid), .offset_words = 10, .offset_bits = 0, .size_bits = 32 }, { OPA_PATH_REC_FIELD(opa.slid), .offset_words = 11, .offset_bits = 0, .size_bits = 32 }, { 
OPA_PATH_REC_FIELD(opa.raw_traffic), .offset_words = 12, .offset_bits = 0, .size_bits = 1 }, { RESERVED, .offset_words = 12, .offset_bits = 1, .size_bits = 3 }, { OPA_PATH_REC_FIELD(flow_label), .offset_words = 12, .offset_bits = 4, .size_bits = 20 }, { OPA_PATH_REC_FIELD(hop_limit), .offset_words = 12, .offset_bits = 24, .size_bits = 8 }, { OPA_PATH_REC_FIELD(traffic_class), .offset_words = 13, .offset_bits = 0, .size_bits = 8 }, { OPA_PATH_REC_FIELD(reversible), .offset_words = 13, .offset_bits = 8, .size_bits = 1 }, { OPA_PATH_REC_FIELD(numb_path), .offset_words = 13, .offset_bits = 9, .size_bits = 7 }, { OPA_PATH_REC_FIELD(pkey), .offset_words = 13, .offset_bits = 16, .size_bits = 16 }, { OPA_PATH_REC_FIELD(opa.l2_8B), .offset_words = 14, .offset_bits = 0, .size_bits = 1 }, { OPA_PATH_REC_FIELD(opa.l2_10B), .offset_words = 14, .offset_bits = 1, .size_bits = 1 }, { OPA_PATH_REC_FIELD(opa.l2_9B), .offset_words = 14, .offset_bits = 2, .size_bits = 1 }, { OPA_PATH_REC_FIELD(opa.l2_16B), .offset_words = 14, .offset_bits = 3, .size_bits = 1 }, { RESERVED, .offset_words = 14, .offset_bits = 4, .size_bits = 2 }, { OPA_PATH_REC_FIELD(opa.qos_type), .offset_words = 14, .offset_bits = 6, .size_bits = 2 }, { OPA_PATH_REC_FIELD(opa.qos_priority), .offset_words = 14, .offset_bits = 8, .size_bits = 8 }, { RESERVED, .offset_words = 14, .offset_bits = 16, .size_bits = 3 }, { OPA_PATH_REC_FIELD(sl), .offset_words = 14, .offset_bits = 19, .size_bits = 5 }, { RESERVED, .offset_words = 14, .offset_bits = 24, .size_bits = 8 }, { OPA_PATH_REC_FIELD(mtu_selector), .offset_words = 15, .offset_bits = 0, .size_bits = 2 }, { OPA_PATH_REC_FIELD(mtu), .offset_words = 15, .offset_bits = 2, .size_bits = 6 }, { OPA_PATH_REC_FIELD(rate_selector), .offset_words = 15, .offset_bits = 8, .size_bits = 2 }, { OPA_PATH_REC_FIELD(rate), .offset_words = 15, .offset_bits = 10, .size_bits = 6 }, { OPA_PATH_REC_FIELD(packet_life_time_selector), .offset_words = 15, .offset_bits = 16, .size_bits = 2 }, { OPA_PATH_REC_FIELD(packet_life_time), .offset_words = 15, .offset_bits = 18, .size_bits = 6 }, { OPA_PATH_REC_FIELD(preference), .offset_words = 15, .offset_bits = 24, .size_bits = 8 }, }; #define MCMEMBER_REC_FIELD(field) \ .struct_offset_bytes = offsetof(struct ib_sa_mcmember_rec, field), \ .struct_size_bytes = sizeof_field(struct ib_sa_mcmember_rec, field), \ .field_name = "sa_mcmember_rec:" #field static const struct ib_field mcmember_rec_table[] = { { MCMEMBER_REC_FIELD(mgid), .offset_words = 0, .offset_bits = 0, .size_bits = 128 }, { MCMEMBER_REC_FIELD(port_gid), .offset_words = 4, .offset_bits = 0, .size_bits = 128 }, { MCMEMBER_REC_FIELD(qkey), .offset_words = 8, .offset_bits = 0, .size_bits = 32 }, { MCMEMBER_REC_FIELD(mlid), .offset_words = 9, .offset_bits = 0, .size_bits = 16 }, { MCMEMBER_REC_FIELD(mtu_selector), .offset_words = 9, .offset_bits = 16, .size_bits = 2 }, { MCMEMBER_REC_FIELD(mtu), .offset_words = 9, .offset_bits = 18, .size_bits = 6 }, { MCMEMBER_REC_FIELD(traffic_class), .offset_words = 9, .offset_bits = 24, .size_bits = 8 }, { MCMEMBER_REC_FIELD(pkey), .offset_words = 10, .offset_bits = 0, .size_bits = 16 }, { MCMEMBER_REC_FIELD(rate_selector), .offset_words = 10, .offset_bits = 16, .size_bits = 2 }, { MCMEMBER_REC_FIELD(rate), .offset_words = 10, .offset_bits = 18, .size_bits = 6 }, { MCMEMBER_REC_FIELD(packet_life_time_selector), .offset_words = 10, .offset_bits = 24, .size_bits = 2 }, { MCMEMBER_REC_FIELD(packet_life_time), .offset_words = 10, .offset_bits = 26, .size_bits = 6 }, { 
MCMEMBER_REC_FIELD(sl), .offset_words = 11, .offset_bits = 0, .size_bits = 4 }, { MCMEMBER_REC_FIELD(flow_label), .offset_words = 11, .offset_bits = 4, .size_bits = 20 }, { MCMEMBER_REC_FIELD(hop_limit), .offset_words = 11, .offset_bits = 24, .size_bits = 8 }, { MCMEMBER_REC_FIELD(scope), .offset_words = 12, .offset_bits = 0, .size_bits = 4 }, { MCMEMBER_REC_FIELD(join_state), .offset_words = 12, .offset_bits = 4, .size_bits = 4 }, { MCMEMBER_REC_FIELD(proxy_join), .offset_words = 12, .offset_bits = 8, .size_bits = 1 }, { RESERVED, .offset_words = 12, .offset_bits = 9, .size_bits = 23 }, }; #define CLASSPORTINFO_REC_FIELD(field) \ .struct_offset_bytes = offsetof(struct ib_class_port_info, field), \ .struct_size_bytes = sizeof_field(struct ib_class_port_info, field), \ .field_name = "ib_class_port_info:" #field static const struct ib_field ib_classport_info_rec_table[] = { { CLASSPORTINFO_REC_FIELD(base_version), .offset_words = 0, .offset_bits = 0, .size_bits = 8 }, { CLASSPORTINFO_REC_FIELD(class_version), .offset_words = 0, .offset_bits = 8, .size_bits = 8 }, { CLASSPORTINFO_REC_FIELD(capability_mask), .offset_words = 0, .offset_bits = 16, .size_bits = 16 }, { CLASSPORTINFO_REC_FIELD(cap_mask2_resp_time), .offset_words = 1, .offset_bits = 0, .size_bits = 32 }, { CLASSPORTINFO_REC_FIELD(redirect_gid), .offset_words = 2, .offset_bits = 0, .size_bits = 128 }, { CLASSPORTINFO_REC_FIELD(redirect_tcslfl), .offset_words = 6, .offset_bits = 0, .size_bits = 32 }, { CLASSPORTINFO_REC_FIELD(redirect_lid), .offset_words = 7, .offset_bits = 0, .size_bits = 16 }, { CLASSPORTINFO_REC_FIELD(redirect_pkey), .offset_words = 7, .offset_bits = 16, .size_bits = 16 }, { CLASSPORTINFO_REC_FIELD(redirect_qp), .offset_words = 8, .offset_bits = 0, .size_bits = 32 }, { CLASSPORTINFO_REC_FIELD(redirect_qkey), .offset_words = 9, .offset_bits = 0, .size_bits = 32 }, { CLASSPORTINFO_REC_FIELD(trap_gid), .offset_words = 10, .offset_bits = 0, .size_bits = 128 }, { CLASSPORTINFO_REC_FIELD(trap_tcslfl), .offset_words = 14, .offset_bits = 0, .size_bits = 32 }, { CLASSPORTINFO_REC_FIELD(trap_lid), .offset_words = 15, .offset_bits = 0, .size_bits = 16 }, { CLASSPORTINFO_REC_FIELD(trap_pkey), .offset_words = 15, .offset_bits = 16, .size_bits = 16 }, { CLASSPORTINFO_REC_FIELD(trap_hlqp), .offset_words = 16, .offset_bits = 0, .size_bits = 32 }, { CLASSPORTINFO_REC_FIELD(trap_qkey), .offset_words = 17, .offset_bits = 0, .size_bits = 32 }, }; #define OPA_CLASSPORTINFO_REC_FIELD(field) \ .struct_offset_bytes =\ offsetof(struct opa_class_port_info, field), \ .struct_size_bytes = \ sizeof_field(struct opa_class_port_info, field), \ .field_name = "opa_class_port_info:" #field static const struct ib_field opa_classport_info_rec_table[] = { { OPA_CLASSPORTINFO_REC_FIELD(base_version), .offset_words = 0, .offset_bits = 0, .size_bits = 8 }, { OPA_CLASSPORTINFO_REC_FIELD(class_version), .offset_words = 0, .offset_bits = 8, .size_bits = 8 }, { OPA_CLASSPORTINFO_REC_FIELD(cap_mask), .offset_words = 0, .offset_bits = 16, .size_bits = 16 }, { OPA_CLASSPORTINFO_REC_FIELD(cap_mask2_resp_time), .offset_words = 1, .offset_bits = 0, .size_bits = 32 }, { OPA_CLASSPORTINFO_REC_FIELD(redirect_gid), .offset_words = 2, .offset_bits = 0, .size_bits = 128 }, { OPA_CLASSPORTINFO_REC_FIELD(redirect_tc_fl), .offset_words = 6, .offset_bits = 0, .size_bits = 32 }, { OPA_CLASSPORTINFO_REC_FIELD(redirect_lid), .offset_words = 7, .offset_bits = 0, .size_bits = 32 }, { OPA_CLASSPORTINFO_REC_FIELD(redirect_sl_qp), .offset_words = 8, .offset_bits = 0, .size_bits = 
32 }, { OPA_CLASSPORTINFO_REC_FIELD(redirect_qkey), .offset_words = 9, .offset_bits = 0, .size_bits = 32 }, { OPA_CLASSPORTINFO_REC_FIELD(trap_gid), .offset_words = 10, .offset_bits = 0, .size_bits = 128 }, { OPA_CLASSPORTINFO_REC_FIELD(trap_tc_fl), .offset_words = 14, .offset_bits = 0, .size_bits = 32 }, { OPA_CLASSPORTINFO_REC_FIELD(trap_lid), .offset_words = 15, .offset_bits = 0, .size_bits = 32 }, { OPA_CLASSPORTINFO_REC_FIELD(trap_hl_qp), .offset_words = 16, .offset_bits = 0, .size_bits = 32 }, { OPA_CLASSPORTINFO_REC_FIELD(trap_qkey), .offset_words = 17, .offset_bits = 0, .size_bits = 32 }, { OPA_CLASSPORTINFO_REC_FIELD(trap_pkey), .offset_words = 18, .offset_bits = 0, .size_bits = 16 }, { OPA_CLASSPORTINFO_REC_FIELD(redirect_pkey), .offset_words = 18, .offset_bits = 16, .size_bits = 16 }, { OPA_CLASSPORTINFO_REC_FIELD(trap_sl_rsvd), .offset_words = 19, .offset_bits = 0, .size_bits = 8 }, { RESERVED, .offset_words = 19, .offset_bits = 8, .size_bits = 24 }, }; #define GUIDINFO_REC_FIELD(field) \ .struct_offset_bytes = offsetof(struct ib_sa_guidinfo_rec, field), \ .struct_size_bytes = sizeof_field(struct ib_sa_guidinfo_rec, field), \ .field_name = "sa_guidinfo_rec:" #field static const struct ib_field guidinfo_rec_table[] = { { GUIDINFO_REC_FIELD(lid), .offset_words = 0, .offset_bits = 0, .size_bits = 16 }, { GUIDINFO_REC_FIELD(block_num), .offset_words = 0, .offset_bits = 16, .size_bits = 8 }, { GUIDINFO_REC_FIELD(res1), .offset_words = 0, .offset_bits = 24, .size_bits = 8 }, { GUIDINFO_REC_FIELD(res2), .offset_words = 1, .offset_bits = 0, .size_bits = 32 }, { GUIDINFO_REC_FIELD(guid_info_list), .offset_words = 2, .offset_bits = 0, .size_bits = 512 }, }; #define SERVICE_REC_FIELD(field) \ .struct_offset_bytes = offsetof(struct sa_service_rec, field), \ .struct_size_bytes = sizeof_field(struct sa_service_rec, field), \ .field_name = "sa_service_rec:" #field static const struct ib_field service_rec_table[] = { { SERVICE_REC_FIELD(id), .offset_words = 0, .offset_bits = 0, .size_bits = 64 }, { SERVICE_REC_FIELD(gid), .offset_words = 2, .offset_bits = 0, .size_bits = 128 }, { SERVICE_REC_FIELD(pkey), .offset_words = 6, .offset_bits = 0, .size_bits = 16 }, { RESERVED, .offset_words = 6, .offset_bits = 16, .size_bits = 16 }, { SERVICE_REC_FIELD(lease), .offset_words = 7, .offset_bits = 0, .size_bits = 32 }, { SERVICE_REC_FIELD(key), .offset_words = 8, .offset_bits = 0, .size_bits = 128 }, { SERVICE_REC_FIELD(name), .offset_words = 12, .offset_bits = 0, .size_bits = 512 }, { SERVICE_REC_FIELD(data_8), .offset_words = 28, .offset_bits = 0, .size_bits = 128 }, { SERVICE_REC_FIELD(data_16), .offset_words = 32, .offset_bits = 0, .size_bits = 128 }, { SERVICE_REC_FIELD(data_32), .offset_words = 36, .offset_bits = 0, .size_bits = 128 }, { SERVICE_REC_FIELD(data_64), .offset_words = 40, .offset_bits = 0, .size_bits = 128 }, }; #define RDMA_PRIMARY_PATH_MAX_REC_NUM 3 static inline void ib_sa_disable_local_svc(struct ib_sa_query *query) { query->flags &= ~IB_SA_ENABLE_LOCAL_SERVICE; } static inline int ib_sa_query_cancelled(struct ib_sa_query *query) { return (query->flags & IB_SA_CANCEL); } static void ib_nl_set_path_rec_attrs(struct sk_buff *skb, struct ib_sa_query *query) { struct sa_path_rec *sa_rec = query->mad_buf->context[1]; struct ib_sa_mad *mad = query->mad_buf->mad; ib_sa_comp_mask comp_mask = mad->sa_hdr.comp_mask; u16 val16; u64 val64; struct rdma_ls_resolve_header *header; query->mad_buf->context[1] = NULL; /* Construct the family header first */ header = skb_put(skb, 
NLMSG_ALIGN(sizeof(*header))); strscpy_pad(header->device_name, dev_name(&query->port->agent->device->dev), LS_DEVICE_NAME_MAX); header->port_num = query->port->port_num; if ((comp_mask & IB_SA_PATH_REC_REVERSIBLE) && sa_rec->reversible != 0) query->path_use = LS_RESOLVE_PATH_USE_ALL; else query->path_use = LS_RESOLVE_PATH_USE_UNIDIRECTIONAL; header->path_use = query->path_use; /* Now build the attributes */ if (comp_mask & IB_SA_PATH_REC_SERVICE_ID) { val64 = be64_to_cpu(sa_rec->service_id); nla_put(skb, RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_SERVICE_ID, sizeof(val64), &val64); } if (comp_mask & IB_SA_PATH_REC_DGID) nla_put(skb, RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_DGID, sizeof(sa_rec->dgid), &sa_rec->dgid); if (comp_mask & IB_SA_PATH_REC_SGID) nla_put(skb, RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_SGID, sizeof(sa_rec->sgid), &sa_rec->sgid); if (comp_mask & IB_SA_PATH_REC_TRAFFIC_CLASS) nla_put(skb, RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_TCLASS, sizeof(sa_rec->traffic_class), &sa_rec->traffic_class); if (comp_mask & IB_SA_PATH_REC_PKEY) { val16 = be16_to_cpu(sa_rec->pkey); nla_put(skb, RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_PKEY, sizeof(val16), &val16); } if (comp_mask & IB_SA_PATH_REC_QOS_CLASS) { val16 = be16_to_cpu(sa_rec->qos_class); nla_put(skb, RDMA_NLA_F_MANDATORY | LS_NLA_TYPE_QOS_CLASS, sizeof(val16), &val16); } } static int ib_nl_get_path_rec_attrs_len(ib_sa_comp_mask comp_mask) { int len = 0; if (comp_mask & IB_SA_PATH_REC_SERVICE_ID) len += nla_total_size(sizeof(u64)); if (comp_mask & IB_SA_PATH_REC_DGID) len += nla_total_size(sizeof(struct rdma_nla_ls_gid)); if (comp_mask & IB_SA_PATH_REC_SGID) len += nla_total_size(sizeof(struct rdma_nla_ls_gid)); if (comp_mask & IB_SA_PATH_REC_TRAFFIC_CLASS) len += nla_total_size(sizeof(u8)); if (comp_mask & IB_SA_PATH_REC_PKEY) len += nla_total_size(sizeof(u16)); if (comp_mask & IB_SA_PATH_REC_QOS_CLASS) len += nla_total_size(sizeof(u16)); /* * Make sure that at least some of the required comp_mask bits are * set. */ if (WARN_ON(len == 0)) return len; /* Add the family header */ len += NLMSG_ALIGN(sizeof(struct rdma_ls_resolve_header)); return len; } static int ib_nl_make_request(struct ib_sa_query *query, gfp_t gfp_mask) { struct sk_buff *skb = NULL; struct nlmsghdr *nlh; void *data; struct ib_sa_mad *mad; int len; unsigned long flags; unsigned long delay; gfp_t gfp_flag; int ret; INIT_LIST_HEAD(&query->list); query->seq = (u32)atomic_inc_return(&ib_nl_sa_request_seq); mad = query->mad_buf->mad; len = ib_nl_get_path_rec_attrs_len(mad->sa_hdr.comp_mask); if (len <= 0) return -EMSGSIZE; skb = nlmsg_new(len, gfp_mask); if (!skb) return -ENOMEM; /* Put nlmsg header only for now */ data = ibnl_put_msg(skb, &nlh, query->seq, 0, RDMA_NL_LS, RDMA_NL_LS_OP_RESOLVE, NLM_F_REQUEST); if (!data) { nlmsg_free(skb); return -EMSGSIZE; } /* Add attributes */ ib_nl_set_path_rec_attrs(skb, query); /* Repair the nlmsg header length */ nlmsg_end(skb, nlh); gfp_flag = ((gfp_mask & GFP_ATOMIC) == GFP_ATOMIC) ? 
GFP_ATOMIC : GFP_NOWAIT; spin_lock_irqsave(&ib_nl_request_lock, flags); ret = rdma_nl_multicast(&init_net, skb, RDMA_NL_GROUP_LS, gfp_flag); if (ret) goto out; /* Put the request on the list.*/ delay = msecs_to_jiffies(sa_local_svc_timeout_ms); query->timeout = delay + jiffies; list_add_tail(&query->list, &ib_nl_request_list); /* Start the timeout if this is the only request */ if (ib_nl_request_list.next == &query->list) queue_delayed_work(ib_nl_wq, &ib_nl_timed_work, delay); out: spin_unlock_irqrestore(&ib_nl_request_lock, flags); return ret; } static int ib_nl_cancel_request(struct ib_sa_query *query) { unsigned long flags; struct ib_sa_query *wait_query; int found = 0; spin_lock_irqsave(&ib_nl_request_lock, flags); list_for_each_entry(wait_query, &ib_nl_request_list, list) { /* Let the timeout to take care of the callback */ if (query == wait_query) { query->flags |= IB_SA_CANCEL; query->timeout = jiffies; list_move(&query->list, &ib_nl_request_list); found = 1; mod_delayed_work(ib_nl_wq, &ib_nl_timed_work, 1); break; } } spin_unlock_irqrestore(&ib_nl_request_lock, flags); return found; } static void send_handler(struct ib_mad_agent *agent, struct ib_mad_send_wc *mad_send_wc); static void ib_nl_process_good_resolve_rsp(struct ib_sa_query *query, const struct nlmsghdr *nlh) { struct sa_path_rec recs[RDMA_PRIMARY_PATH_MAX_REC_NUM]; struct ib_sa_path_query *path_query; struct ib_path_rec_data *rec_data; struct ib_mad_send_wc mad_send_wc; const struct nlattr *head, *curr; struct ib_sa_mad *mad = NULL; int len, rem, status = -EIO; unsigned int num_prs = 0; u32 mask = 0; if (!query->callback) goto out; path_query = container_of(query, struct ib_sa_path_query, sa_query); mad = query->mad_buf->mad; head = (const struct nlattr *) nlmsg_data(nlh); len = nlmsg_len(nlh); switch (query->path_use) { case LS_RESOLVE_PATH_USE_UNIDIRECTIONAL: mask = IB_PATH_PRIMARY | IB_PATH_OUTBOUND; break; case LS_RESOLVE_PATH_USE_ALL: mask = IB_PATH_PRIMARY; break; case LS_RESOLVE_PATH_USE_GMP: default: mask = IB_PATH_PRIMARY | IB_PATH_GMP | IB_PATH_BIDIRECTIONAL; break; } nla_for_each_attr(curr, head, len, rem) { if (curr->nla_type != LS_NLA_TYPE_PATH_RECORD) continue; rec_data = nla_data(curr); if ((rec_data->flags & mask) != mask) continue; if ((query->flags & IB_SA_QUERY_OPA) || path_query->conv_pr) { mad->mad_hdr.method |= IB_MGMT_METHOD_RESP; memcpy(mad->data, rec_data->path_rec, sizeof(rec_data->path_rec)); query->callback(query, 0, mad); goto out; } status = 0; ib_unpack(path_rec_table, ARRAY_SIZE(path_rec_table), rec_data->path_rec, &recs[num_prs]); recs[num_prs].flags = rec_data->flags; recs[num_prs].rec_type = SA_PATH_REC_TYPE_IB; sa_path_set_dmac_zero(&recs[num_prs]); num_prs++; if (num_prs >= RDMA_PRIMARY_PATH_MAX_REC_NUM) break; } if (!status) { mad->mad_hdr.method |= IB_MGMT_METHOD_RESP; path_query->callback(status, recs, num_prs, path_query->context); } else query->callback(query, status, mad); out: mad_send_wc.send_buf = query->mad_buf; mad_send_wc.status = IB_WC_SUCCESS; send_handler(query->mad_buf->mad_agent, &mad_send_wc); } static void ib_nl_request_timeout(struct work_struct *work) { unsigned long flags; struct ib_sa_query *query; unsigned long delay; struct ib_mad_send_wc mad_send_wc; int ret; spin_lock_irqsave(&ib_nl_request_lock, flags); while (!list_empty(&ib_nl_request_list)) { query = list_entry(ib_nl_request_list.next, struct ib_sa_query, list); if (time_after(query->timeout, jiffies)) { delay = query->timeout - jiffies; if ((long)delay <= 0) delay = 1; queue_delayed_work(ib_nl_wq, 
&ib_nl_timed_work, delay); break; } list_del(&query->list); ib_sa_disable_local_svc(query); /* Hold the lock to protect against query cancellation */ if (ib_sa_query_cancelled(query)) ret = -1; else ret = ib_post_send_mad(query->mad_buf, NULL); if (ret) { mad_send_wc.send_buf = query->mad_buf; mad_send_wc.status = IB_WC_WR_FLUSH_ERR; spin_unlock_irqrestore(&ib_nl_request_lock, flags); send_handler(query->port->agent, &mad_send_wc); spin_lock_irqsave(&ib_nl_request_lock, flags); } } spin_unlock_irqrestore(&ib_nl_request_lock, flags); } int ib_nl_handle_set_timeout(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { int timeout, delta, abs_delta; const struct nlattr *attr; unsigned long flags; struct ib_sa_query *query; long delay = 0; struct nlattr *tb[LS_NLA_TYPE_MAX]; int ret; if (!(nlh->nlmsg_flags & NLM_F_REQUEST) || !(NETLINK_CB(skb).sk)) return -EPERM; ret = nla_parse_deprecated(tb, LS_NLA_TYPE_MAX - 1, nlmsg_data(nlh), nlmsg_len(nlh), ib_nl_policy, NULL); attr = (const struct nlattr *)tb[LS_NLA_TYPE_TIMEOUT]; if (ret || !attr) goto settimeout_out; timeout = *(int *) nla_data(attr); if (timeout < IB_SA_LOCAL_SVC_TIMEOUT_MIN) timeout = IB_SA_LOCAL_SVC_TIMEOUT_MIN; if (timeout > IB_SA_LOCAL_SVC_TIMEOUT_MAX) timeout = IB_SA_LOCAL_SVC_TIMEOUT_MAX; spin_lock_irqsave(&ib_nl_request_lock, flags); delta = timeout - sa_local_svc_timeout_ms; if (delta < 0) abs_delta = -delta; else abs_delta = delta; if (delta != 0) { sa_local_svc_timeout_ms = timeout; list_for_each_entry(query, &ib_nl_request_list, list) { if (delta < 0 && abs_delta > query->timeout) query->timeout = 0; else query->timeout += delta; /* Get the new delay from the first entry */ if (!delay) { delay = query->timeout - jiffies; if (delay <= 0) delay = 1; } } if (delay) mod_delayed_work(ib_nl_wq, &ib_nl_timed_work, (unsigned long)delay); } spin_unlock_irqrestore(&ib_nl_request_lock, flags); settimeout_out: return 0; } static inline int ib_nl_is_good_resolve_resp(const struct nlmsghdr *nlh) { struct nlattr *tb[LS_NLA_TYPE_MAX]; int ret; if (nlh->nlmsg_flags & RDMA_NL_LS_F_ERR) return 0; ret = nla_parse_deprecated(tb, LS_NLA_TYPE_MAX - 1, nlmsg_data(nlh), nlmsg_len(nlh), ib_nl_policy, NULL); if (ret) return 0; return 1; } int ib_nl_handle_resolve_resp(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { unsigned long flags; struct ib_sa_query *query = NULL, *iter; struct ib_mad_send_buf *send_buf; struct ib_mad_send_wc mad_send_wc; int ret; if ((nlh->nlmsg_flags & NLM_F_REQUEST) || !(NETLINK_CB(skb).sk)) return -EPERM; spin_lock_irqsave(&ib_nl_request_lock, flags); list_for_each_entry(iter, &ib_nl_request_list, list) { /* * If the query is cancelled, let the timeout routine * take care of it. 
*/ if (nlh->nlmsg_seq == iter->seq) { if (!ib_sa_query_cancelled(iter)) { list_del(&iter->list); query = iter; } break; } } if (!query) { spin_unlock_irqrestore(&ib_nl_request_lock, flags); goto resp_out; } send_buf = query->mad_buf; if (!ib_nl_is_good_resolve_resp(nlh)) { /* if the result is a failure, send out the packet via IB */ ib_sa_disable_local_svc(query); ret = ib_post_send_mad(query->mad_buf, NULL); spin_unlock_irqrestore(&ib_nl_request_lock, flags); if (ret) { mad_send_wc.send_buf = send_buf; mad_send_wc.status = IB_WC_GENERAL_ERR; send_handler(query->port->agent, &mad_send_wc); } } else { spin_unlock_irqrestore(&ib_nl_request_lock, flags); ib_nl_process_good_resolve_rsp(query, nlh); } resp_out: return 0; } static void free_sm_ah(struct kref *kref) { struct ib_sa_sm_ah *sm_ah = container_of(kref, struct ib_sa_sm_ah, ref); rdma_destroy_ah(sm_ah->ah, 0); kfree(sm_ah); } void ib_sa_register_client(struct ib_sa_client *client) { atomic_set(&client->users, 1); init_completion(&client->comp); } EXPORT_SYMBOL(ib_sa_register_client); void ib_sa_unregister_client(struct ib_sa_client *client) { ib_sa_client_put(client); wait_for_completion(&client->comp); } EXPORT_SYMBOL(ib_sa_unregister_client); /** * ib_sa_cancel_query - try to cancel an SA query * @id:ID of query to cancel * @query:query pointer to cancel * * Try to cancel an SA query. If the id and query don't match up or * the query has already completed, nothing is done. Otherwise the * query is canceled and will complete with a status of -EINTR. */ void ib_sa_cancel_query(int id, struct ib_sa_query *query) { unsigned long flags; struct ib_mad_send_buf *mad_buf; xa_lock_irqsave(&queries, flags); if (xa_load(&queries, id) != query) { xa_unlock_irqrestore(&queries, flags); return; } mad_buf = query->mad_buf; xa_unlock_irqrestore(&queries, flags); /* * If the query is still on the netlink request list, schedule * it to be cancelled by the timeout routine. Otherwise, it has been * sent to the MAD layer and has to be cancelled from there. */ if (!ib_nl_cancel_request(query)) ib_cancel_mad(mad_buf); } EXPORT_SYMBOL(ib_sa_cancel_query); static u8 get_src_path_mask(struct ib_device *device, u32 port_num) { struct ib_sa_device *sa_dev; struct ib_sa_port *port; unsigned long flags; u8 src_path_mask; sa_dev = ib_get_client_data(device, &sa_client); if (!sa_dev) return 0x7f; port = &sa_dev->port[port_num - sa_dev->start_port]; spin_lock_irqsave(&port->ah_lock, flags); src_path_mask = port->sm_ah ? port->sm_ah->src_path_mask : 0x7f; spin_unlock_irqrestore(&port->ah_lock, flags); return src_path_mask; } static int init_ah_attr_grh_fields(struct ib_device *device, u32 port_num, struct sa_path_rec *rec, struct rdma_ah_attr *ah_attr, const struct ib_gid_attr *gid_attr) { enum ib_gid_type type = sa_conv_pathrec_to_gid_type(rec); if (!gid_attr) { gid_attr = rdma_find_gid_by_port(device, &rec->sgid, type, port_num, NULL); if (IS_ERR(gid_attr)) return PTR_ERR(gid_attr); } else rdma_hold_gid_attr(gid_attr); rdma_move_grh_sgid_attr(ah_attr, &rec->dgid, be32_to_cpu(rec->flow_label), rec->hop_limit, rec->traffic_class, gid_attr); return 0; } /** * ib_init_ah_attr_from_path - Initialize address handle attributes based on * an SA path record. * @device: Device associated ah attributes initialization. * @port_num: Port on the specified device. * @rec: path record entry to use for ah attributes initialization. * @ah_attr: address handle attributes to initialization from path record. * @gid_attr: SGID attribute to consider during initialization. 
* * When ib_init_ah_attr_from_path() returns success, * (a) for IB link layer it optionally contains a reference to SGID attribute * when GRH is present for IB link layer. * (b) for RoCE link layer it contains a reference to SGID attribute. * User must invoke rdma_destroy_ah_attr() to release reference to SGID * attributes which are initialized using ib_init_ah_attr_from_path(). */ int ib_init_ah_attr_from_path(struct ib_device *device, u32 port_num, struct sa_path_rec *rec, struct rdma_ah_attr *ah_attr, const struct ib_gid_attr *gid_attr) { int ret = 0; memset(ah_attr, 0, sizeof(*ah_attr)); ah_attr->type = rdma_ah_find_type(device, port_num); rdma_ah_set_sl(ah_attr, rec->sl); rdma_ah_set_port_num(ah_attr, port_num); rdma_ah_set_static_rate(ah_attr, rec->rate); if (sa_path_is_roce(rec)) { ret = roce_resolve_route_from_path(rec, gid_attr); if (ret) return ret; memcpy(ah_attr->roce.dmac, sa_path_get_dmac(rec), ETH_ALEN); } else { rdma_ah_set_dlid(ah_attr, be32_to_cpu(sa_path_get_dlid(rec))); if (sa_path_is_opa(rec) && rdma_ah_get_dlid(ah_attr) == be16_to_cpu(IB_LID_PERMISSIVE)) rdma_ah_set_make_grd(ah_attr, true); rdma_ah_set_path_bits(ah_attr, be32_to_cpu(sa_path_get_slid(rec)) & get_src_path_mask(device, port_num)); } if (rec->hop_limit > 0 || sa_path_is_roce(rec)) ret = init_ah_attr_grh_fields(device, port_num, rec, ah_attr, gid_attr); return ret; } EXPORT_SYMBOL(ib_init_ah_attr_from_path); static int alloc_mad(struct ib_sa_query *query, gfp_t gfp_mask) { struct rdma_ah_attr ah_attr; unsigned long flags; spin_lock_irqsave(&query->port->ah_lock, flags); if (!query->port->sm_ah) { spin_unlock_irqrestore(&query->port->ah_lock, flags); return -EAGAIN; } kref_get(&query->port->sm_ah->ref); query->sm_ah = query->port->sm_ah; spin_unlock_irqrestore(&query->port->ah_lock, flags); /* * Always check if sm_ah has valid dlid assigned, * before querying for class port info */ if ((rdma_query_ah(query->sm_ah->ah, &ah_attr) < 0) || !rdma_is_valid_unicast_lid(&ah_attr)) { kref_put(&query->sm_ah->ref, free_sm_ah); return -EAGAIN; } query->mad_buf = ib_create_send_mad(query->port->agent, 1, query->sm_ah->pkey_index, 0, IB_MGMT_SA_HDR, IB_MGMT_SA_DATA, gfp_mask, ((query->flags & IB_SA_QUERY_OPA) ? 
OPA_MGMT_BASE_VERSION : IB_MGMT_BASE_VERSION)); if (IS_ERR(query->mad_buf)) { kref_put(&query->sm_ah->ref, free_sm_ah); return -ENOMEM; } query->mad_buf->ah = query->sm_ah->ah; return 0; } static void free_mad(struct ib_sa_query *query) { ib_free_send_mad(query->mad_buf); kref_put(&query->sm_ah->ref, free_sm_ah); } static void init_mad(struct ib_sa_query *query, struct ib_mad_agent *agent) { struct ib_sa_mad *mad = query->mad_buf->mad; unsigned long flags; memset(mad, 0, sizeof *mad); if (query->flags & IB_SA_QUERY_OPA) { mad->mad_hdr.base_version = OPA_MGMT_BASE_VERSION; mad->mad_hdr.class_version = OPA_SA_CLASS_VERSION; } else { mad->mad_hdr.base_version = IB_MGMT_BASE_VERSION; mad->mad_hdr.class_version = IB_SA_CLASS_VERSION; } mad->mad_hdr.mgmt_class = IB_MGMT_CLASS_SUBN_ADM; spin_lock_irqsave(&tid_lock, flags); mad->mad_hdr.tid = cpu_to_be64(((u64) agent->hi_tid) << 32 | tid++); spin_unlock_irqrestore(&tid_lock, flags); } static int send_mad(struct ib_sa_query *query, unsigned long timeout_ms, gfp_t gfp_mask) { unsigned long flags; int ret, id; const int nmbr_sa_query_retries = 10; xa_lock_irqsave(&queries, flags); ret = __xa_alloc(&queries, &id, query, xa_limit_32b, gfp_mask); xa_unlock_irqrestore(&queries, flags); if (ret < 0) return ret; query->mad_buf->timeout_ms = timeout_ms / nmbr_sa_query_retries; query->mad_buf->retries = nmbr_sa_query_retries; if (!query->mad_buf->timeout_ms) { /* Special case, very small timeout_ms */ query->mad_buf->timeout_ms = 1; query->mad_buf->retries = timeout_ms; } query->mad_buf->context[0] = query; query->id = id; if ((query->flags & IB_SA_ENABLE_LOCAL_SERVICE) && (!(query->flags & IB_SA_QUERY_OPA))) { if (rdma_nl_chk_listeners(RDMA_NL_GROUP_LS)) { if (!ib_nl_make_request(query, gfp_mask)) return id; } ib_sa_disable_local_svc(query); } ret = ib_post_send_mad(query->mad_buf, NULL); if (ret) { xa_lock_irqsave(&queries, flags); __xa_erase(&queries, id); xa_unlock_irqrestore(&queries, flags); } /* * It's not safe to dereference query any more, because the * send may already have completed and freed the query in * another context. */ return ret ? ret : id; } void ib_sa_unpack_path(void *attribute, struct sa_path_rec *rec) { ib_unpack(path_rec_table, ARRAY_SIZE(path_rec_table), attribute, rec); } EXPORT_SYMBOL(ib_sa_unpack_path); void ib_sa_pack_path(struct sa_path_rec *rec, void *attribute) { ib_pack(path_rec_table, ARRAY_SIZE(path_rec_table), rec, attribute); } EXPORT_SYMBOL(ib_sa_pack_path); void ib_sa_pack_service(struct sa_service_rec *rec, void *attribute) { ib_pack(service_rec_table, ARRAY_SIZE(service_rec_table), rec, attribute); } EXPORT_SYMBOL(ib_sa_pack_service); void ib_sa_unpack_service(void *attribute, struct sa_service_rec *rec) { ib_unpack(service_rec_table, ARRAY_SIZE(service_rec_table), attribute, rec); } EXPORT_SYMBOL(ib_sa_unpack_service); static bool ib_sa_opa_pathrecord_support(struct ib_sa_client *client, struct ib_sa_device *sa_dev, u32 port_num) { struct ib_sa_port *port; unsigned long flags; bool ret = false; port = &sa_dev->port[port_num - sa_dev->start_port]; spin_lock_irqsave(&port->classport_lock, flags); if (!port->classport_info.valid) goto ret; if (port->classport_info.data.type == RDMA_CLASS_PORT_INFO_OPA) ret = opa_get_cpi_capmask2(&port->classport_info.data.opa) & OPA_CLASS_PORT_INFO_PR_SUPPORT; ret: spin_unlock_irqrestore(&port->classport_lock, flags); return ret; } enum opa_pr_supported { PR_NOT_SUPPORTED, PR_OPA_SUPPORTED, PR_IB_SUPPORTED }; /* * opa_pr_query_possible - Check if current PR query can be an OPA query. 
* * Returns PR_NOT_SUPPORTED if a path record query is not * possible, PR_OPA_SUPPORTED if an OPA path record query * is possible and PR_IB_SUPPORTED if an IB path record * query is possible. */ static int opa_pr_query_possible(struct ib_sa_client *client, struct ib_sa_device *sa_dev, struct ib_device *device, u32 port_num) { struct ib_port_attr port_attr; if (ib_query_port(device, port_num, &port_attr)) return PR_NOT_SUPPORTED; if (ib_sa_opa_pathrecord_support(client, sa_dev, port_num)) return PR_OPA_SUPPORTED; if (port_attr.lid >= be16_to_cpu(IB_MULTICAST_LID_BASE)) return PR_NOT_SUPPORTED; else return PR_IB_SUPPORTED; } static void ib_sa_path_rec_callback(struct ib_sa_query *sa_query, int status, struct ib_sa_mad *mad) { struct ib_sa_path_query *query = container_of(sa_query, struct ib_sa_path_query, sa_query); struct sa_path_rec rec = {}; if (!mad) { query->callback(status, NULL, 0, query->context); return; } if (sa_query->flags & IB_SA_QUERY_OPA) { ib_unpack(opa_path_rec_table, ARRAY_SIZE(opa_path_rec_table), mad->data, &rec); rec.rec_type = SA_PATH_REC_TYPE_OPA; query->callback(status, &rec, 1, query->context); return; } ib_unpack(path_rec_table, ARRAY_SIZE(path_rec_table), mad->data, &rec); rec.rec_type = SA_PATH_REC_TYPE_IB; sa_path_set_dmac_zero(&rec); if (query->conv_pr) { struct sa_path_rec opa; memset(&opa, 0, sizeof(struct sa_path_rec)); sa_convert_path_ib_to_opa(&opa, &rec); query->callback(status, &opa, 1, query->context); } else { query->callback(status, &rec, 1, query->context); } } #define IB_SA_DATA_OFFS 56 #define IB_SERVICE_REC_SZ 176 static void ib_unpack_service_rmpp(struct sa_service_rec *rec, struct ib_mad_recv_wc *mad_wc, int num_services) { unsigned int cp_sz, data_i, data_size, rec_i = 0, buf_i = 0; struct ib_mad_recv_buf *mad_buf; u8 buf[IB_SERVICE_REC_SZ]; u8 *data; data_size = sizeof(((struct ib_sa_mad *) mad_buf->mad)->data); list_for_each_entry(mad_buf, &mad_wc->rmpp_list, list) { data = ((struct ib_sa_mad *) mad_buf->mad)->data; data_i = 0; while (data_i < data_size && rec_i < num_services) { cp_sz = min(IB_SERVICE_REC_SZ - buf_i, data_size - data_i); memcpy(buf + buf_i, data + data_i, cp_sz); data_i += cp_sz; buf_i += cp_sz; if (buf_i == IB_SERVICE_REC_SZ) { ib_sa_unpack_service(buf, rec + rec_i); buf_i = 0; rec_i++; } } } } static void ib_sa_service_rec_callback(struct ib_sa_query *sa_query, int status, struct ib_mad_recv_wc *mad_wc) { struct ib_sa_service_query *query = container_of(sa_query, struct ib_sa_service_query, sa_query); struct sa_service_rec *rec; int num_services; if (!mad_wc || !mad_wc->recv_buf.mad) { query->callback(status, NULL, 0, query->context); return; } num_services = (mad_wc->mad_len - IB_SA_DATA_OFFS) / IB_SERVICE_REC_SZ; if (!num_services) { query->callback(-ENODATA, NULL, 0, query->context); return; } rec = kmalloc_array(num_services, sizeof(*rec), GFP_KERNEL); if (!rec) { query->callback(-ENOMEM, NULL, 0, query->context); return; } ib_unpack_service_rmpp(rec, mad_wc, num_services); query->callback(status, rec, num_services, query->context); kfree(rec); } static void ib_sa_path_rec_release(struct ib_sa_query *sa_query) { struct ib_sa_path_query *query = container_of(sa_query, struct ib_sa_path_query, sa_query); kfree(query->conv_pr); kfree(query); } static void ib_sa_service_rec_release(struct ib_sa_query *sa_query) { struct ib_sa_service_query *query = container_of(sa_query, struct ib_sa_service_query, sa_query); kfree(query); } /** * ib_sa_path_rec_get - Start a Path get query * @client:SA client * @device:device to send query 
on * @port_num: port number to send query on * @rec:Path Record to send in query * @comp_mask:component mask to send in query * @timeout_ms:time to wait for response * @gfp_mask:GFP mask to use for internal allocations * @callback:function called when query completes, times out or is * canceled * @context:opaque user context passed to callback * @sa_query:query context, used to cancel query * * Send a Path Record Get query to the SA to look up a path. The * callback function will be called when the query completes (or * fails); status is 0 for a successful response, -EINTR if the query * is canceled, -ETIMEDOUT is the query timed out, or -EIO if an error * occurred sending the query. The resp parameter of the callback is * only valid if status is 0. * * If the return value of ib_sa_path_rec_get() is negative, it is an * error code. Otherwise it is a query ID that can be used to cancel * the query. */ int ib_sa_path_rec_get(struct ib_sa_client *client, struct ib_device *device, u32 port_num, struct sa_path_rec *rec, ib_sa_comp_mask comp_mask, unsigned long timeout_ms, gfp_t gfp_mask, void (*callback)(int status, struct sa_path_rec *resp, unsigned int num_paths, void *context), void *context, struct ib_sa_query **sa_query) { struct ib_sa_path_query *query; struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client); struct ib_sa_port *port; struct ib_mad_agent *agent; struct ib_sa_mad *mad; enum opa_pr_supported status; int ret; if (!sa_dev) return -ENODEV; if ((rec->rec_type != SA_PATH_REC_TYPE_IB) && (rec->rec_type != SA_PATH_REC_TYPE_OPA)) return -EINVAL; port = &sa_dev->port[port_num - sa_dev->start_port]; agent = port->agent; query = kzalloc(sizeof(*query), gfp_mask); if (!query) return -ENOMEM; query->sa_query.port = port; if (rec->rec_type == SA_PATH_REC_TYPE_OPA) { status = opa_pr_query_possible(client, sa_dev, device, port_num); if (status == PR_NOT_SUPPORTED) { ret = -EINVAL; goto err1; } else if (status == PR_OPA_SUPPORTED) { query->sa_query.flags |= IB_SA_QUERY_OPA; } else { query->conv_pr = kmalloc(sizeof(*query->conv_pr), gfp_mask); if (!query->conv_pr) { ret = -ENOMEM; goto err1; } } } ret = alloc_mad(&query->sa_query, gfp_mask); if (ret) goto err2; ib_sa_client_get(client); query->sa_query.client = client; query->callback = callback; query->context = context; mad = query->sa_query.mad_buf->mad; init_mad(&query->sa_query, agent); query->sa_query.callback = callback ? ib_sa_path_rec_callback : NULL; query->sa_query.release = ib_sa_path_rec_release; mad->mad_hdr.method = IB_MGMT_METHOD_GET; mad->mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_PATH_REC); mad->sa_hdr.comp_mask = comp_mask; if (query->sa_query.flags & IB_SA_QUERY_OPA) { ib_pack(opa_path_rec_table, ARRAY_SIZE(opa_path_rec_table), rec, mad->data); } else if (query->conv_pr) { sa_convert_path_opa_to_ib(query->conv_pr, rec); ib_pack(path_rec_table, ARRAY_SIZE(path_rec_table), query->conv_pr, mad->data); } else { ib_pack(path_rec_table, ARRAY_SIZE(path_rec_table), rec, mad->data); } *sa_query = &query->sa_query; query->sa_query.flags |= IB_SA_ENABLE_LOCAL_SERVICE; query->sa_query.mad_buf->context[1] = (query->conv_pr) ? 
query->conv_pr : rec; ret = send_mad(&query->sa_query, timeout_ms, gfp_mask); if (ret < 0) goto err3; return ret; err3: *sa_query = NULL; ib_sa_client_put(query->sa_query.client); free_mad(&query->sa_query); err2: kfree(query->conv_pr); err1: kfree(query); return ret; } EXPORT_SYMBOL(ib_sa_path_rec_get); /** * ib_sa_service_rec_get - Start a Service get query * @client: SA client * @device: device to send query on * @port_num: port number to send query on * @rec: Service Record to send in query * @comp_mask: component mask to send in query * @timeout_ms: time to wait for response * @gfp_mask: GFP mask to use for internal allocations * @callback: function called when query completes, times out or is * canceled * @context: opaque user context passed to callback * @sa_query: query context, used to cancel query * * Send a Service Record Get query to the SA to look up a path. The * callback function will be called when the query completes (or * fails); status is 0 for a successful response, -EINTR if the query * is canceled, -ETIMEDOUT is the query timed out, or -EIO if an error * occurred sending the query. The resp parameter of the callback is * only valid if status is 0. * * If the return value of ib_sa_service_rec_get() is negative, it is an * error code. Otherwise it is a query ID that can be used to cancel * the query. */ int ib_sa_service_rec_get(struct ib_sa_client *client, struct ib_device *device, u32 port_num, struct sa_service_rec *rec, ib_sa_comp_mask comp_mask, unsigned long timeout_ms, gfp_t gfp_mask, void (*callback)(int status, struct sa_service_rec *resp, unsigned int num_services, void *context), void *context, struct ib_sa_query **sa_query) { struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client); struct ib_sa_service_query *query; struct ib_mad_agent *agent; struct ib_sa_port *port; struct ib_sa_mad *mad; int ret; if (!sa_dev) return -ENODEV; port = &sa_dev->port[port_num - sa_dev->start_port]; agent = port->agent; query = kzalloc(sizeof(*query), gfp_mask); if (!query) return -ENOMEM; query->sa_query.port = port; ret = alloc_mad(&query->sa_query, gfp_mask); if (ret) goto err1; ib_sa_client_get(client); query->sa_query.client = client; query->callback = callback; query->context = context; mad = query->sa_query.mad_buf->mad; init_mad(&query->sa_query, agent); query->sa_query.rmpp_callback = callback ? 
ib_sa_service_rec_callback : NULL; query->sa_query.release = ib_sa_service_rec_release; mad->mad_hdr.method = IB_MGMT_METHOD_GET_TABLE; mad->mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_SERVICE_REC); mad->sa_hdr.comp_mask = comp_mask; ib_sa_pack_service(rec, mad->data); *sa_query = &query->sa_query; query->sa_query.mad_buf->context[1] = rec; ret = send_mad(&query->sa_query, timeout_ms, gfp_mask); if (ret < 0) goto err2; return ret; err2: *sa_query = NULL; ib_sa_client_put(query->sa_query.client); free_mad(&query->sa_query); err1: kfree(query); return ret; } EXPORT_SYMBOL(ib_sa_service_rec_get); static void ib_sa_mcmember_rec_callback(struct ib_sa_query *sa_query, int status, struct ib_sa_mad *mad) { struct ib_sa_mcmember_query *query = container_of(sa_query, struct ib_sa_mcmember_query, sa_query); if (mad) { struct ib_sa_mcmember_rec rec; ib_unpack(mcmember_rec_table, ARRAY_SIZE(mcmember_rec_table), mad->data, &rec); query->callback(status, &rec, query->context); } else query->callback(status, NULL, query->context); } static void ib_sa_mcmember_rec_release(struct ib_sa_query *sa_query) { kfree(container_of(sa_query, struct ib_sa_mcmember_query, sa_query)); } int ib_sa_mcmember_rec_query(struct ib_sa_client *client, struct ib_device *device, u32 port_num, u8 method, struct ib_sa_mcmember_rec *rec, ib_sa_comp_mask comp_mask, unsigned long timeout_ms, gfp_t gfp_mask, void (*callback)(int status, struct ib_sa_mcmember_rec *resp, void *context), void *context, struct ib_sa_query **sa_query) { struct ib_sa_mcmember_query *query; struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client); struct ib_sa_port *port; struct ib_mad_agent *agent; struct ib_sa_mad *mad; int ret; if (!sa_dev) return -ENODEV; port = &sa_dev->port[port_num - sa_dev->start_port]; agent = port->agent; query = kzalloc(sizeof(*query), gfp_mask); if (!query) return -ENOMEM; query->sa_query.port = port; ret = alloc_mad(&query->sa_query, gfp_mask); if (ret) goto err1; ib_sa_client_get(client); query->sa_query.client = client; query->callback = callback; query->context = context; mad = query->sa_query.mad_buf->mad; init_mad(&query->sa_query, agent); query->sa_query.callback = callback ? 
ib_sa_mcmember_rec_callback : NULL; query->sa_query.release = ib_sa_mcmember_rec_release; mad->mad_hdr.method = method; mad->mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_MC_MEMBER_REC); mad->sa_hdr.comp_mask = comp_mask; ib_pack(mcmember_rec_table, ARRAY_SIZE(mcmember_rec_table), rec, mad->data); *sa_query = &query->sa_query; ret = send_mad(&query->sa_query, timeout_ms, gfp_mask); if (ret < 0) goto err2; return ret; err2: *sa_query = NULL; ib_sa_client_put(query->sa_query.client); free_mad(&query->sa_query); err1: kfree(query); return ret; } /* Support GuidInfoRecord */ static void ib_sa_guidinfo_rec_callback(struct ib_sa_query *sa_query, int status, struct ib_sa_mad *mad) { struct ib_sa_guidinfo_query *query = container_of(sa_query, struct ib_sa_guidinfo_query, sa_query); if (mad) { struct ib_sa_guidinfo_rec rec; ib_unpack(guidinfo_rec_table, ARRAY_SIZE(guidinfo_rec_table), mad->data, &rec); query->callback(status, &rec, query->context); } else query->callback(status, NULL, query->context); } static void ib_sa_guidinfo_rec_release(struct ib_sa_query *sa_query) { kfree(container_of(sa_query, struct ib_sa_guidinfo_query, sa_query)); } int ib_sa_guid_info_rec_query(struct ib_sa_client *client, struct ib_device *device, u32 port_num, struct ib_sa_guidinfo_rec *rec, ib_sa_comp_mask comp_mask, u8 method, unsigned long timeout_ms, gfp_t gfp_mask, void (*callback)(int status, struct ib_sa_guidinfo_rec *resp, void *context), void *context, struct ib_sa_query **sa_query) { struct ib_sa_guidinfo_query *query; struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client); struct ib_sa_port *port; struct ib_mad_agent *agent; struct ib_sa_mad *mad; int ret; if (!sa_dev) return -ENODEV; if (method != IB_MGMT_METHOD_GET && method != IB_MGMT_METHOD_SET && method != IB_SA_METHOD_DELETE) { return -EINVAL; } port = &sa_dev->port[port_num - sa_dev->start_port]; agent = port->agent; query = kzalloc(sizeof(*query), gfp_mask); if (!query) return -ENOMEM; query->sa_query.port = port; ret = alloc_mad(&query->sa_query, gfp_mask); if (ret) goto err1; ib_sa_client_get(client); query->sa_query.client = client; query->callback = callback; query->context = context; mad = query->sa_query.mad_buf->mad; init_mad(&query->sa_query, agent); query->sa_query.callback = callback ? 
ib_sa_guidinfo_rec_callback : NULL; query->sa_query.release = ib_sa_guidinfo_rec_release; mad->mad_hdr.method = method; mad->mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_GUID_INFO_REC); mad->sa_hdr.comp_mask = comp_mask; ib_pack(guidinfo_rec_table, ARRAY_SIZE(guidinfo_rec_table), rec, mad->data); *sa_query = &query->sa_query; ret = send_mad(&query->sa_query, timeout_ms, gfp_mask); if (ret < 0) goto err2; return ret; err2: *sa_query = NULL; ib_sa_client_put(query->sa_query.client); free_mad(&query->sa_query); err1: kfree(query); return ret; } EXPORT_SYMBOL(ib_sa_guid_info_rec_query); struct ib_classport_info_context { struct completion done; struct ib_sa_query *sa_query; }; static void ib_classportinfo_cb(void *context) { struct ib_classport_info_context *cb_ctx = context; complete(&cb_ctx->done); } static void ib_sa_classport_info_rec_callback(struct ib_sa_query *sa_query, int status, struct ib_sa_mad *mad) { unsigned long flags; struct ib_sa_classport_info_query *query = container_of(sa_query, struct ib_sa_classport_info_query, sa_query); struct ib_sa_classport_cache *info = &sa_query->port->classport_info; if (mad) { if (sa_query->flags & IB_SA_QUERY_OPA) { struct opa_class_port_info rec; ib_unpack(opa_classport_info_rec_table, ARRAY_SIZE(opa_classport_info_rec_table), mad->data, &rec); spin_lock_irqsave(&sa_query->port->classport_lock, flags); if (!status && !info->valid) { memcpy(&info->data.opa, &rec, sizeof(info->data.opa)); info->valid = true; info->data.type = RDMA_CLASS_PORT_INFO_OPA; } spin_unlock_irqrestore(&sa_query->port->classport_lock, flags); } else { struct ib_class_port_info rec; ib_unpack(ib_classport_info_rec_table, ARRAY_SIZE(ib_classport_info_rec_table), mad->data, &rec); spin_lock_irqsave(&sa_query->port->classport_lock, flags); if (!status && !info->valid) { memcpy(&info->data.ib, &rec, sizeof(info->data.ib)); info->valid = true; info->data.type = RDMA_CLASS_PORT_INFO_IB; } spin_unlock_irqrestore(&sa_query->port->classport_lock, flags); } } query->callback(query->context); } static void ib_sa_classport_info_rec_release(struct ib_sa_query *sa_query) { kfree(container_of(sa_query, struct ib_sa_classport_info_query, sa_query)); } static int ib_sa_classport_info_rec_query(struct ib_sa_port *port, unsigned long timeout_ms, void (*callback)(void *context), void *context, struct ib_sa_query **sa_query) { struct ib_mad_agent *agent; struct ib_sa_classport_info_query *query; struct ib_sa_mad *mad; gfp_t gfp_mask = GFP_KERNEL; int ret; agent = port->agent; query = kzalloc(sizeof(*query), gfp_mask); if (!query) return -ENOMEM; query->sa_query.port = port; query->sa_query.flags |= rdma_cap_opa_ah(port->agent->device, port->port_num) ? 
IB_SA_QUERY_OPA : 0; ret = alloc_mad(&query->sa_query, gfp_mask); if (ret) goto err_free; query->callback = callback; query->context = context; mad = query->sa_query.mad_buf->mad; init_mad(&query->sa_query, agent); query->sa_query.callback = ib_sa_classport_info_rec_callback; query->sa_query.release = ib_sa_classport_info_rec_release; mad->mad_hdr.method = IB_MGMT_METHOD_GET; mad->mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_CLASS_PORTINFO); mad->sa_hdr.comp_mask = 0; *sa_query = &query->sa_query; ret = send_mad(&query->sa_query, timeout_ms, gfp_mask); if (ret < 0) goto err_free_mad; return ret; err_free_mad: *sa_query = NULL; free_mad(&query->sa_query); err_free: kfree(query); return ret; } static void update_ib_cpi(struct work_struct *work) { struct ib_sa_port *port = container_of(work, struct ib_sa_port, ib_cpi_work.work); struct ib_classport_info_context *cb_context; unsigned long flags; int ret; /* If the classport info is valid, nothing * to do here. */ spin_lock_irqsave(&port->classport_lock, flags); if (port->classport_info.valid) { spin_unlock_irqrestore(&port->classport_lock, flags); return; } spin_unlock_irqrestore(&port->classport_lock, flags); cb_context = kmalloc(sizeof(*cb_context), GFP_KERNEL); if (!cb_context) goto err_nomem; init_completion(&cb_context->done); ret = ib_sa_classport_info_rec_query(port, 3000, ib_classportinfo_cb, cb_context, &cb_context->sa_query); if (ret < 0) goto free_cb_err; wait_for_completion(&cb_context->done); free_cb_err: kfree(cb_context); spin_lock_irqsave(&port->classport_lock, flags); /* If the classport info is still not valid, the query should have * failed for some reason. Retry issuing the query */ if (!port->classport_info.valid) { port->classport_info.retry_cnt++; if (port->classport_info.retry_cnt <= IB_SA_CPI_MAX_RETRY_CNT) { unsigned long delay = msecs_to_jiffies(IB_SA_CPI_RETRY_WAIT); queue_delayed_work(ib_wq, &port->ib_cpi_work, delay); } } spin_unlock_irqrestore(&port->classport_lock, flags); err_nomem: return; } static void send_handler(struct ib_mad_agent *agent, struct ib_mad_send_wc *mad_send_wc) { struct ib_sa_query *query = mad_send_wc->send_buf->context[0]; unsigned long flags; int status = 0; if (query->callback || query->rmpp_callback) { switch (mad_send_wc->status) { case IB_WC_SUCCESS: /* No callback -- already got recv */ break; case IB_WC_RESP_TIMEOUT_ERR: status = -ETIMEDOUT; break; case IB_WC_WR_FLUSH_ERR: status = -EINTR; break; default: status = -EIO; break; } if (status) query->callback ? query->callback(query, status, NULL) : query->rmpp_callback(query, status, NULL); } xa_lock_irqsave(&queries, flags); __xa_erase(&queries, query->id); xa_unlock_irqrestore(&queries, flags); free_mad(query); if (query->client) ib_sa_client_put(query->client); query->release(query); } static void recv_handler(struct ib_mad_agent *mad_agent, struct ib_mad_send_buf *send_buf, struct ib_mad_recv_wc *mad_recv_wc) { struct ib_sa_query *query; struct ib_mad *mad; if (!send_buf) return; query = send_buf->context[0]; mad = mad_recv_wc->recv_buf.mad; if (query->rmpp_callback) { if (mad_recv_wc->wc->status == IB_WC_SUCCESS) query->rmpp_callback(query, mad->mad_hdr.status ? -EINVAL : 0, mad_recv_wc); else query->rmpp_callback(query, -EIO, NULL); } else if (query->callback) { if (mad_recv_wc->wc->status == IB_WC_SUCCESS) query->callback(query, mad->mad_hdr.status ? 
-EINVAL : 0, (struct ib_sa_mad *)mad); else query->callback(query, -EIO, NULL); } ib_free_recv_mad(mad_recv_wc); } static void update_sm_ah(struct work_struct *work) { struct ib_sa_port *port = container_of(work, struct ib_sa_port, update_task); struct ib_sa_sm_ah *new_ah; struct ib_port_attr port_attr; struct rdma_ah_attr ah_attr; bool grh_required; if (ib_query_port(port->agent->device, port->port_num, &port_attr)) { pr_warn("Couldn't query port\n"); return; } new_ah = kmalloc(sizeof(*new_ah), GFP_KERNEL); if (!new_ah) return; kref_init(&new_ah->ref); new_ah->src_path_mask = (1 << port_attr.lmc) - 1; new_ah->pkey_index = 0; if (ib_find_pkey(port->agent->device, port->port_num, IB_DEFAULT_PKEY_FULL, &new_ah->pkey_index)) pr_err("Couldn't find index for default PKey\n"); memset(&ah_attr, 0, sizeof(ah_attr)); ah_attr.type = rdma_ah_find_type(port->agent->device, port->port_num); rdma_ah_set_dlid(&ah_attr, port_attr.sm_lid); rdma_ah_set_sl(&ah_attr, port_attr.sm_sl); rdma_ah_set_port_num(&ah_attr, port->port_num); grh_required = rdma_is_grh_required(port->agent->device, port->port_num); /* * The OPA sm_lid of 0xFFFF needs special handling so that it can be * differentiated from a permissive LID of 0xFFFF. We set the * grh_required flag here so the SA can program the DGID in the * address handle appropriately */ if (ah_attr.type == RDMA_AH_ATTR_TYPE_OPA && (grh_required || port_attr.sm_lid == be16_to_cpu(IB_LID_PERMISSIVE))) rdma_ah_set_make_grd(&ah_attr, true); if (ah_attr.type == RDMA_AH_ATTR_TYPE_IB && grh_required) { rdma_ah_set_ah_flags(&ah_attr, IB_AH_GRH); rdma_ah_set_subnet_prefix(&ah_attr, cpu_to_be64(port_attr.subnet_prefix)); rdma_ah_set_interface_id(&ah_attr, cpu_to_be64(IB_SA_WELL_KNOWN_GUID)); } new_ah->ah = rdma_create_ah(port->agent->qp->pd, &ah_attr, RDMA_CREATE_AH_SLEEPABLE); if (IS_ERR(new_ah->ah)) { pr_warn("Couldn't create new SM AH\n"); kfree(new_ah); return; } spin_lock_irq(&port->ah_lock); if (port->sm_ah) kref_put(&port->sm_ah->ref, free_sm_ah); port->sm_ah = new_ah; spin_unlock_irq(&port->ah_lock); } static void ib_sa_event(struct ib_event_handler *handler, struct ib_event *event) { if (event->event == IB_EVENT_PORT_ERR || event->event == IB_EVENT_PORT_ACTIVE || event->event == IB_EVENT_LID_CHANGE || event->event == IB_EVENT_PKEY_CHANGE || event->event == IB_EVENT_SM_CHANGE || event->event == IB_EVENT_CLIENT_REREGISTER) { unsigned long flags; struct ib_sa_device *sa_dev = container_of(handler, typeof(*sa_dev), event_handler); u32 port_num = event->element.port_num - sa_dev->start_port; struct ib_sa_port *port = &sa_dev->port[port_num]; if (!rdma_cap_ib_sa(handler->device, port->port_num)) return; spin_lock_irqsave(&port->ah_lock, flags); if (port->sm_ah) kref_put(&port->sm_ah->ref, free_sm_ah); port->sm_ah = NULL; spin_unlock_irqrestore(&port->ah_lock, flags); if (event->event == IB_EVENT_SM_CHANGE || event->event == IB_EVENT_CLIENT_REREGISTER || event->event == IB_EVENT_LID_CHANGE || event->event == IB_EVENT_PORT_ACTIVE) { unsigned long delay = msecs_to_jiffies(IB_SA_CPI_RETRY_WAIT); spin_lock_irqsave(&port->classport_lock, flags); port->classport_info.valid = false; port->classport_info.retry_cnt = 0; spin_unlock_irqrestore(&port->classport_lock, flags); queue_delayed_work(ib_wq, &port->ib_cpi_work, delay); } queue_work(ib_wq, &sa_dev->port[port_num].update_task); } } static int ib_sa_add_one(struct ib_device *device) { struct ib_sa_device *sa_dev; int s, e, i; int count = 0; int ret; s = rdma_start_port(device); e = rdma_end_port(device); sa_dev = 
kzalloc(struct_size(sa_dev, port, size_add(size_sub(e, s), 1)), GFP_KERNEL); if (!sa_dev) return -ENOMEM; sa_dev->start_port = s; sa_dev->end_port = e; for (i = 0; i <= e - s; ++i) { spin_lock_init(&sa_dev->port[i].ah_lock); if (!rdma_cap_ib_sa(device, i + 1)) continue; sa_dev->port[i].sm_ah = NULL; sa_dev->port[i].port_num = i + s; spin_lock_init(&sa_dev->port[i].classport_lock); sa_dev->port[i].classport_info.valid = false; sa_dev->port[i].agent = ib_register_mad_agent(device, i + s, IB_QPT_GSI, NULL, IB_MGMT_RMPP_VERSION, send_handler, recv_handler, sa_dev, 0); if (IS_ERR(sa_dev->port[i].agent)) { ret = PTR_ERR(sa_dev->port[i].agent); goto err; } INIT_WORK(&sa_dev->port[i].update_task, update_sm_ah); INIT_DELAYED_WORK(&sa_dev->port[i].ib_cpi_work, update_ib_cpi); count++; } if (!count) { ret = -EOPNOTSUPP; goto free; } ib_set_client_data(device, &sa_client, sa_dev); /* * We register our event handler after everything is set up, * and then update our cached info after the event handler is * registered to avoid any problems if a port changes state * during our initialization. */ INIT_IB_EVENT_HANDLER(&sa_dev->event_handler, device, ib_sa_event); ib_register_event_handler(&sa_dev->event_handler); for (i = 0; i <= e - s; ++i) { if (rdma_cap_ib_sa(device, i + 1)) update_sm_ah(&sa_dev->port[i].update_task); } return 0; err: while (--i >= 0) { if (rdma_cap_ib_sa(device, i + 1)) ib_unregister_mad_agent(sa_dev->port[i].agent); } free: kfree(sa_dev); return ret; } static void ib_sa_remove_one(struct ib_device *device, void *client_data) { struct ib_sa_device *sa_dev = client_data; int i; ib_unregister_event_handler(&sa_dev->event_handler); flush_workqueue(ib_wq); for (i = 0; i <= sa_dev->end_port - sa_dev->start_port; ++i) { if (rdma_cap_ib_sa(device, i + 1)) { cancel_delayed_work_sync(&sa_dev->port[i].ib_cpi_work); ib_unregister_mad_agent(sa_dev->port[i].agent); if (sa_dev->port[i].sm_ah) kref_put(&sa_dev->port[i].sm_ah->ref, free_sm_ah); } } kfree(sa_dev); } int ib_sa_init(void) { int ret; get_random_bytes(&tid, sizeof tid); atomic_set(&ib_nl_sa_request_seq, 0); ret = ib_register_client(&sa_client); if (ret) { pr_err("Couldn't register ib_sa client\n"); goto err1; } ret = mcast_init(); if (ret) { pr_err("Couldn't initialize multicast handling\n"); goto err2; } ib_nl_wq = alloc_ordered_workqueue("ib_nl_sa_wq", WQ_MEM_RECLAIM); if (!ib_nl_wq) { ret = -ENOMEM; goto err3; } INIT_DELAYED_WORK(&ib_nl_timed_work, ib_nl_request_timeout); return 0; err3: mcast_cleanup(); err2: ib_unregister_client(&sa_client); err1: return ret; } void ib_sa_cleanup(void) { cancel_delayed_work(&ib_nl_timed_work); destroy_workqueue(ib_nl_wq); mcast_cleanup(); ib_unregister_client(&sa_client); WARN_ON(!xa_empty(&queries)); }
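/*
 * Editor's note -- illustrative sketch only, not part of sa_query.c or of any
 * upstream header: a minimal example of how an in-kernel consumer might drive
 * the path-record API implemented above.  The helper names (my_sa_client,
 * my_path_done, my_resolve_path, my_sa_example_*) are hypothetical, and the
 * component-mask and timeout choices are arbitrary; only the calls to
 * ib_sa_register_client(), ib_sa_path_rec_get(), ib_sa_cancel_query() and
 * ib_sa_unregister_client() come from the file above.
 */
#include <linux/init.h>
#include <linux/module.h>
#include <linux/printk.h>
#include <rdma/ib_verbs.h>
#include <rdma/ib_sa.h>

static struct ib_sa_client my_sa_client;

/* Completion callback: status == 0 means @resp holds @num_paths records. */
static void my_path_done(int status, struct sa_path_rec *resp,
			 unsigned int num_paths, void *context)
{
	if (status)
		pr_warn("path record query failed: %d\n", status);
	else if (num_paths)
		pr_info("resolved %u path(s), first pkey 0x%x\n",
			num_paths, be16_to_cpu(resp->pkey));
}

static int __maybe_unused my_resolve_path(struct ib_device *device,
					  u32 port_num,
					  const union ib_gid *sgid,
					  const union ib_gid *dgid)
{
	struct sa_path_rec rec = {};
	struct ib_sa_query *query;
	int id;

	rec.rec_type = SA_PATH_REC_TYPE_IB;
	rec.sgid = *sgid;
	rec.dgid = *dgid;

	id = ib_sa_path_rec_get(&my_sa_client, device, port_num, &rec,
				IB_SA_PATH_REC_DGID | IB_SA_PATH_REC_SGID,
				3000, GFP_KERNEL, my_path_done, NULL, &query);
	/*
	 * A negative return is an error; otherwise it is the query id, which
	 * may be passed to ib_sa_cancel_query(id, query) until the callback
	 * has run.
	 */
	return id < 0 ? id : 0;
}

static int __init my_sa_example_init(void)
{
	/* Register once, typically at module load. */
	ib_sa_register_client(&my_sa_client);
	return 0;
}

static void __exit my_sa_example_exit(void)
{
	/* Waits for outstanding queries issued through this client. */
	ib_sa_unregister_client(&my_sa_client);
}

module_init(my_sa_example_init);
module_exit(my_sa_example_exit);
MODULE_LICENSE("GPL");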
// SPDX-License-Identifier: GPL-2.0-only /* * Network node table * * SELinux must keep a mapping of network nodes to labels/SIDs. This * mapping is maintained as part of the normal policy but a fast cache is * needed to reduce the lookup overhead since most of these queries happen on * a per-packet basis. * * Author: Paul Moore <paul@paul-moore.com> * * This code is heavily based on the "netif" concept originally developed by * James Morris <jmorris@redhat.com> * (see security/selinux/netif.c for more information) */ /* * (c) Copyright Hewlett-Packard Development Company, L.P., 2007 */ #include <linux/types.h> #include <linux/rcupdate.h> #include <linux/list.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/in.h> #include <linux/in6.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <net/ip.h> #include <net/ipv6.h> #include "netnode.h" #include "objsec.h" #define SEL_NETNODE_HASH_SIZE 256 #define SEL_NETNODE_HASH_BKT_LIMIT 16 struct sel_netnode_bkt { unsigned int size; struct list_head list; }; struct sel_netnode { struct netnode_security_struct nsec; struct list_head list; struct rcu_head rcu; }; /* NOTE: we are using a combined hash table for both IPv4 and IPv6, the reason * for this is that I suspect most users will not make heavy use of both * address families at the same time so one table will usually end up wasted, * if this becomes a problem we can always add a hash table for each address * family later */ static DEFINE_SPINLOCK(sel_netnode_lock); static struct sel_netnode_bkt sel_netnode_hash[SEL_NETNODE_HASH_SIZE]; /** * sel_netnode_hashfn_ipv4 - IPv4 hashing function for the node table * @addr: IPv4 address * * Description: * This is the IPv4 hashing function for the node interface table, it returns * the bucket number for the given IP address. * */ static unsigned int sel_netnode_hashfn_ipv4(__be32 addr) { /* at some point we should determine if the mismatch in byte order * affects the hash function dramatically */ return (addr & (SEL_NETNODE_HASH_SIZE - 1)); } /** * sel_netnode_hashfn_ipv6 - IPv6 hashing function for the node table * @addr: IPv6 address * * Description: * This is the IPv6 hashing function for the node interface table, it returns * the bucket number for the given IP address.
* */ static unsigned int sel_netnode_hashfn_ipv6(const struct in6_addr *addr) { /* just hash the least significant 32 bits to keep things fast (they * are the most likely to be different anyway), we can revisit this * later if needed */ return (addr->s6_addr32[3] & (SEL_NETNODE_HASH_SIZE - 1)); } /** * sel_netnode_find - Search for a node record * @addr: IP address * @family: address family * * Description: * Search the network node table and return the record matching @addr. If an * entry can not be found in the table return NULL. * */ static struct sel_netnode *sel_netnode_find(const void *addr, u16 family) { unsigned int idx; struct sel_netnode *node; switch (family) { case PF_INET: idx = sel_netnode_hashfn_ipv4(*(const __be32 *)addr); break; case PF_INET6: idx = sel_netnode_hashfn_ipv6(addr); break; default: BUG(); return NULL; } list_for_each_entry_rcu(node, &sel_netnode_hash[idx].list, list) if (node->nsec.family == family) switch (family) { case PF_INET: if (node->nsec.addr.ipv4 == *(const __be32 *)addr) return node; break; case PF_INET6: if (ipv6_addr_equal(&node->nsec.addr.ipv6, addr)) return node; break; } return NULL; } /** * sel_netnode_insert - Insert a new node into the table * @node: the new node record * * Description: * Add a new node record to the network address hash table. * */ static void sel_netnode_insert(struct sel_netnode *node) { unsigned int idx; switch (node->nsec.family) { case PF_INET: idx = sel_netnode_hashfn_ipv4(node->nsec.addr.ipv4); break; case PF_INET6: idx = sel_netnode_hashfn_ipv6(&node->nsec.addr.ipv6); break; default: BUG(); return; } /* we need to impose a limit on the growth of the hash table so check * this bucket to make sure it is within the specified bounds */ list_add_rcu(&node->list, &sel_netnode_hash[idx].list); if (sel_netnode_hash[idx].size == SEL_NETNODE_HASH_BKT_LIMIT) { struct sel_netnode *tail; tail = list_entry( rcu_dereference_protected( list_tail_rcu(&sel_netnode_hash[idx].list), lockdep_is_held(&sel_netnode_lock)), struct sel_netnode, list); list_del_rcu(&tail->list); kfree_rcu(tail, rcu); } else sel_netnode_hash[idx].size++; } /** * sel_netnode_sid_slow - Lookup the SID of a network address using the policy * @addr: the IP address * @family: the address family * @sid: node SID * * Description: * This function determines the SID of a network address by querying the * security policy. The result is added to the network address table to * speedup future queries. Returns zero on success, negative values on * failure. * */ static int sel_netnode_sid_slow(const void *addr, u16 family, u32 *sid) { int ret; struct sel_netnode *node; struct sel_netnode *new; spin_lock_bh(&sel_netnode_lock); node = sel_netnode_find(addr, family); if (node != NULL) { *sid = node->nsec.sid; spin_unlock_bh(&sel_netnode_lock); return 0; } /* If this memory allocation fails still return 0. The SID * is valid, it just won't be added to the cache. 
*/ new = kmalloc(sizeof(*new), GFP_ATOMIC); switch (family) { case PF_INET: ret = security_node_sid(PF_INET, addr, sizeof(struct in_addr), sid); if (new) new->nsec.addr.ipv4 = *(const __be32 *)addr; break; case PF_INET6: ret = security_node_sid(PF_INET6, addr, sizeof(struct in6_addr), sid); if (new) new->nsec.addr.ipv6 = *(const struct in6_addr *)addr; break; default: BUG(); ret = -EINVAL; } if (ret == 0 && new) { new->nsec.family = family; new->nsec.sid = *sid; sel_netnode_insert(new); } else kfree(new); spin_unlock_bh(&sel_netnode_lock); if (unlikely(ret)) pr_warn("SELinux: failure in %s(), unable to determine network node label\n", __func__); return ret; } /** * sel_netnode_sid - Lookup the SID of a network address * @addr: the IP address * @family: the address family * @sid: node SID * * Description: * This function determines the SID of a network address using the fastest * method possible. First the address table is queried, but if an entry * can't be found then the policy is queried and the result is added to the * table to speedup future queries. Returns zero on success, negative values * on failure. * */ int sel_netnode_sid(const void *addr, u16 family, u32 *sid) { struct sel_netnode *node; rcu_read_lock(); node = sel_netnode_find(addr, family); if (likely(node != NULL)) { *sid = node->nsec.sid; rcu_read_unlock(); return 0; } rcu_read_unlock(); return sel_netnode_sid_slow(addr, family, sid); } /** * sel_netnode_flush - Flush the entire network address table * * Description: * Remove all entries from the network address table. * */ void sel_netnode_flush(void) { unsigned int idx; struct sel_netnode *node, *node_tmp; spin_lock_bh(&sel_netnode_lock); for (idx = 0; idx < SEL_NETNODE_HASH_SIZE; idx++) { list_for_each_entry_safe(node, node_tmp, &sel_netnode_hash[idx].list, list) { list_del_rcu(&node->list); kfree_rcu(node, rcu); } sel_netnode_hash[idx].size = 0; } spin_unlock_bh(&sel_netnode_lock); } static __init int sel_netnode_init(void) { int iter; if (!selinux_enabled_boot) return 0; for (iter = 0; iter < SEL_NETNODE_HASH_SIZE; iter++) { INIT_LIST_HEAD(&sel_netnode_hash[iter].list); sel_netnode_hash[iter].size = 0; } return 0; } __initcall(sel_netnode_init);
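/*
 * Hedged usage sketch, not part of the file above: how a hypothetical caller
 * in the SELinux networking hooks might resolve the SID for an IPv4 peer
 * address. my_lookup_v4() is an invented name; sel_netnode_sid() first tries
 * the RCU-protected cache and only falls back to sel_netnode_sid_slow() (and
 * the policy) on a miss.
 */
static int my_lookup_v4(__be32 daddr, u32 *peer_sid)
{
	/* the address is passed by pointer; the family picks the hash function */
	return sel_netnode_sid(&daddr, PF_INET, peer_sid);
}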
/* SPDX-License-Identifier: GPL-2.0 */ #include <linux/kernel.h> #include <linux/slab.h> #include <net/act_api.h> #include <net/flow_offload.h> #include <linux/rtnetlink.h> #include <linux/mutex.h> #include <linux/rhashtable.h> struct flow_rule *flow_rule_alloc(unsigned int num_actions) { struct flow_rule *rule; int i; rule = kzalloc(struct_size(rule, action.entries, num_actions), GFP_KERNEL); if (!rule) return NULL; rule->action.num_entries = num_actions; /* Pre-fill each action hw_stats with DONT_CARE. * Caller can override this if it wants stats for a given action.
*/ for (i = 0; i < num_actions; i++) rule->action.entries[i].hw_stats = FLOW_ACTION_HW_STATS_DONT_CARE; return rule; } EXPORT_SYMBOL(flow_rule_alloc); struct flow_offload_action *offload_action_alloc(unsigned int num_actions) { struct flow_offload_action *fl_action; int i; fl_action = kzalloc(struct_size(fl_action, action.entries, num_actions), GFP_KERNEL); if (!fl_action) return NULL; fl_action->action.num_entries = num_actions; /* Pre-fill each action hw_stats with DONT_CARE. * Caller can override this if it wants stats for a given action. */ for (i = 0; i < num_actions; i++) fl_action->action.entries[i].hw_stats = FLOW_ACTION_HW_STATS_DONT_CARE; return fl_action; } #define FLOW_DISSECTOR_MATCH(__rule, __type, __out) \ const struct flow_match *__m = &(__rule)->match; \ struct flow_dissector *__d = (__m)->dissector; \ \ (__out)->key = skb_flow_dissector_target(__d, __type, (__m)->key); \ (__out)->mask = skb_flow_dissector_target(__d, __type, (__m)->mask); \ void flow_rule_match_meta(const struct flow_rule *rule, struct flow_match_meta *out) { FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_META, out); } EXPORT_SYMBOL(flow_rule_match_meta); void flow_rule_match_basic(const struct flow_rule *rule, struct flow_match_basic *out) { FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_BASIC, out); } EXPORT_SYMBOL(flow_rule_match_basic); void flow_rule_match_control(const struct flow_rule *rule, struct flow_match_control *out) { FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_CONTROL, out); } EXPORT_SYMBOL(flow_rule_match_control); void flow_rule_match_eth_addrs(const struct flow_rule *rule, struct flow_match_eth_addrs *out) { FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ETH_ADDRS, out); } EXPORT_SYMBOL(flow_rule_match_eth_addrs); void flow_rule_match_vlan(const struct flow_rule *rule, struct flow_match_vlan *out) { FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_VLAN, out); } EXPORT_SYMBOL(flow_rule_match_vlan); void flow_rule_match_cvlan(const struct flow_rule *rule, struct flow_match_vlan *out) { FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_CVLAN, out); } EXPORT_SYMBOL(flow_rule_match_cvlan); void flow_rule_match_arp(const struct flow_rule *rule, struct flow_match_arp *out) { FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ARP, out); } EXPORT_SYMBOL(flow_rule_match_arp); void flow_rule_match_ipv4_addrs(const struct flow_rule *rule, struct flow_match_ipv4_addrs *out) { FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_IPV4_ADDRS, out); } EXPORT_SYMBOL(flow_rule_match_ipv4_addrs); void flow_rule_match_ipv6_addrs(const struct flow_rule *rule, struct flow_match_ipv6_addrs *out) { FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_IPV6_ADDRS, out); } EXPORT_SYMBOL(flow_rule_match_ipv6_addrs); void flow_rule_match_ip(const struct flow_rule *rule, struct flow_match_ip *out) { FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_IP, out); } EXPORT_SYMBOL(flow_rule_match_ip); void flow_rule_match_ports(const struct flow_rule *rule, struct flow_match_ports *out) { FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_PORTS, out); } EXPORT_SYMBOL(flow_rule_match_ports); void flow_rule_match_ports_range(const struct flow_rule *rule, struct flow_match_ports_range *out) { FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_PORTS_RANGE, out); } EXPORT_SYMBOL(flow_rule_match_ports_range); void flow_rule_match_tcp(const struct flow_rule *rule, struct flow_match_tcp *out) { FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_TCP, out); } EXPORT_SYMBOL(flow_rule_match_tcp); void flow_rule_match_ipsec(const struct flow_rule *rule, struct flow_match_ipsec *out) 
{ FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_IPSEC, out); } EXPORT_SYMBOL(flow_rule_match_ipsec); void flow_rule_match_icmp(const struct flow_rule *rule, struct flow_match_icmp *out) { FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ICMP, out); } EXPORT_SYMBOL(flow_rule_match_icmp); void flow_rule_match_mpls(const struct flow_rule *rule, struct flow_match_mpls *out) { FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_MPLS, out); } EXPORT_SYMBOL(flow_rule_match_mpls); void flow_rule_match_enc_control(const struct flow_rule *rule, struct flow_match_control *out) { FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ENC_CONTROL, out); } EXPORT_SYMBOL(flow_rule_match_enc_control); void flow_rule_match_enc_ipv4_addrs(const struct flow_rule *rule, struct flow_match_ipv4_addrs *out) { FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS, out); } EXPORT_SYMBOL(flow_rule_match_enc_ipv4_addrs); void flow_rule_match_enc_ipv6_addrs(const struct flow_rule *rule, struct flow_match_ipv6_addrs *out) { FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS, out); } EXPORT_SYMBOL(flow_rule_match_enc_ipv6_addrs); void flow_rule_match_enc_ip(const struct flow_rule *rule, struct flow_match_ip *out) { FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ENC_IP, out); } EXPORT_SYMBOL(flow_rule_match_enc_ip); void flow_rule_match_enc_ports(const struct flow_rule *rule, struct flow_match_ports *out) { FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ENC_PORTS, out); } EXPORT_SYMBOL(flow_rule_match_enc_ports); void flow_rule_match_enc_keyid(const struct flow_rule *rule, struct flow_match_enc_keyid *out) { FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ENC_KEYID, out); } EXPORT_SYMBOL(flow_rule_match_enc_keyid); void flow_rule_match_enc_opts(const struct flow_rule *rule, struct flow_match_enc_opts *out) { FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ENC_OPTS, out); } EXPORT_SYMBOL(flow_rule_match_enc_opts); struct flow_action_cookie *flow_action_cookie_create(void *data, unsigned int len, gfp_t gfp) { struct flow_action_cookie *cookie; cookie = kmalloc(sizeof(*cookie) + len, gfp); if (!cookie) return NULL; cookie->cookie_len = len; memcpy(cookie->cookie, data, len); return cookie; } EXPORT_SYMBOL(flow_action_cookie_create); void flow_action_cookie_destroy(struct flow_action_cookie *cookie) { kfree(cookie); } EXPORT_SYMBOL(flow_action_cookie_destroy); void flow_rule_match_ct(const struct flow_rule *rule, struct flow_match_ct *out) { FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_CT, out); } EXPORT_SYMBOL(flow_rule_match_ct); void flow_rule_match_pppoe(const struct flow_rule *rule, struct flow_match_pppoe *out) { FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_PPPOE, out); } EXPORT_SYMBOL(flow_rule_match_pppoe); void flow_rule_match_l2tpv3(const struct flow_rule *rule, struct flow_match_l2tpv3 *out) { FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_L2TPV3, out); } EXPORT_SYMBOL(flow_rule_match_l2tpv3); struct flow_block_cb *flow_block_cb_alloc(flow_setup_cb_t *cb, void *cb_ident, void *cb_priv, void (*release)(void *cb_priv)) { struct flow_block_cb *block_cb; block_cb = kzalloc(sizeof(*block_cb), GFP_KERNEL); if (!block_cb) return ERR_PTR(-ENOMEM); block_cb->cb = cb; block_cb->cb_ident = cb_ident; block_cb->cb_priv = cb_priv; block_cb->release = release; return block_cb; } EXPORT_SYMBOL(flow_block_cb_alloc); void flow_block_cb_free(struct flow_block_cb *block_cb) { if (block_cb->release) block_cb->release(block_cb->cb_priv); kfree(block_cb); } EXPORT_SYMBOL(flow_block_cb_free); struct flow_block_cb *flow_block_cb_lookup(struct 
flow_block *block, flow_setup_cb_t *cb, void *cb_ident) { struct flow_block_cb *block_cb; list_for_each_entry(block_cb, &block->cb_list, list) { if (block_cb->cb == cb && block_cb->cb_ident == cb_ident) return block_cb; } return NULL; } EXPORT_SYMBOL(flow_block_cb_lookup); void *flow_block_cb_priv(struct flow_block_cb *block_cb) { return block_cb->cb_priv; } EXPORT_SYMBOL(flow_block_cb_priv); void flow_block_cb_incref(struct flow_block_cb *block_cb) { block_cb->refcnt++; } EXPORT_SYMBOL(flow_block_cb_incref); unsigned int flow_block_cb_decref(struct flow_block_cb *block_cb) { return --block_cb->refcnt; } EXPORT_SYMBOL(flow_block_cb_decref); bool flow_block_cb_is_busy(flow_setup_cb_t *cb, void *cb_ident, struct list_head *driver_block_list) { struct flow_block_cb *block_cb; list_for_each_entry(block_cb, driver_block_list, driver_list) { if (block_cb->cb == cb && block_cb->cb_ident == cb_ident) return true; } return false; } EXPORT_SYMBOL(flow_block_cb_is_busy); int flow_block_cb_setup_simple(struct flow_block_offload *f, struct list_head *driver_block_list, flow_setup_cb_t *cb, void *cb_ident, void *cb_priv, bool ingress_only) { struct flow_block_cb *block_cb; if (ingress_only && f->binder_type != FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS) return -EOPNOTSUPP; f->driver_block_list = driver_block_list; switch (f->command) { case FLOW_BLOCK_BIND: if (flow_block_cb_is_busy(cb, cb_ident, driver_block_list)) return -EBUSY; block_cb = flow_block_cb_alloc(cb, cb_ident, cb_priv, NULL); if (IS_ERR(block_cb)) return PTR_ERR(block_cb); flow_block_cb_add(block_cb, f); list_add_tail(&block_cb->driver_list, driver_block_list); return 0; case FLOW_BLOCK_UNBIND: block_cb = flow_block_cb_lookup(f->block, cb, cb_ident); if (!block_cb) return -ENOENT; flow_block_cb_remove(block_cb, f); list_del(&block_cb->driver_list); return 0; default: return -EOPNOTSUPP; } } EXPORT_SYMBOL(flow_block_cb_setup_simple); static DEFINE_MUTEX(flow_indr_block_lock); static LIST_HEAD(flow_block_indr_list); static LIST_HEAD(flow_block_indr_dev_list); static LIST_HEAD(flow_indir_dev_list); struct flow_indr_dev { struct list_head list; flow_indr_block_bind_cb_t *cb; void *cb_priv; refcount_t refcnt; }; static struct flow_indr_dev *flow_indr_dev_alloc(flow_indr_block_bind_cb_t *cb, void *cb_priv) { struct flow_indr_dev *indr_dev; indr_dev = kmalloc(sizeof(*indr_dev), GFP_KERNEL); if (!indr_dev) return NULL; indr_dev->cb = cb; indr_dev->cb_priv = cb_priv; refcount_set(&indr_dev->refcnt, 1); return indr_dev; } struct flow_indir_dev_info { void *data; struct net_device *dev; struct Qdisc *sch; enum tc_setup_type type; void (*cleanup)(struct flow_block_cb *block_cb); struct list_head list; enum flow_block_command command; enum flow_block_binder_type binder_type; struct list_head *cb_list; }; static void existing_qdiscs_register(flow_indr_block_bind_cb_t *cb, void *cb_priv) { struct flow_block_offload bo; struct flow_indir_dev_info *cur; list_for_each_entry(cur, &flow_indir_dev_list, list) { memset(&bo, 0, sizeof(bo)); bo.command = cur->command; bo.binder_type = cur->binder_type; INIT_LIST_HEAD(&bo.cb_list); cb(cur->dev, cur->sch, cb_priv, cur->type, &bo, cur->data, cur->cleanup); list_splice(&bo.cb_list, cur->cb_list); } } int flow_indr_dev_register(flow_indr_block_bind_cb_t *cb, void *cb_priv) { struct flow_indr_dev *indr_dev; mutex_lock(&flow_indr_block_lock); list_for_each_entry(indr_dev, &flow_block_indr_dev_list, list) { if (indr_dev->cb == cb && indr_dev->cb_priv == cb_priv) { refcount_inc(&indr_dev->refcnt); 
mutex_unlock(&flow_indr_block_lock); return 0; } } indr_dev = flow_indr_dev_alloc(cb, cb_priv); if (!indr_dev) { mutex_unlock(&flow_indr_block_lock); return -ENOMEM; } list_add(&indr_dev->list, &flow_block_indr_dev_list); existing_qdiscs_register(cb, cb_priv); mutex_unlock(&flow_indr_block_lock); tcf_action_reoffload_cb(cb, cb_priv, true); return 0; } EXPORT_SYMBOL(flow_indr_dev_register); static void __flow_block_indr_cleanup(void (*release)(void *cb_priv), void *cb_priv, struct list_head *cleanup_list) { struct flow_block_cb *this, *next; list_for_each_entry_safe(this, next, &flow_block_indr_list, indr.list) { if (this->release == release && this->indr.cb_priv == cb_priv) list_move(&this->indr.list, cleanup_list); } } static void flow_block_indr_notify(struct list_head *cleanup_list) { struct flow_block_cb *this, *next; list_for_each_entry_safe(this, next, cleanup_list, indr.list) { list_del(&this->indr.list); this->indr.cleanup(this); } } void flow_indr_dev_unregister(flow_indr_block_bind_cb_t *cb, void *cb_priv, void (*release)(void *cb_priv)) { struct flow_indr_dev *this, *next, *indr_dev = NULL; LIST_HEAD(cleanup_list); mutex_lock(&flow_indr_block_lock); list_for_each_entry_safe(this, next, &flow_block_indr_dev_list, list) { if (this->cb == cb && this->cb_priv == cb_priv && refcount_dec_and_test(&this->refcnt)) { indr_dev = this; list_del(&indr_dev->list); break; } } if (!indr_dev) { mutex_unlock(&flow_indr_block_lock); return; } __flow_block_indr_cleanup(release, cb_priv, &cleanup_list); mutex_unlock(&flow_indr_block_lock); tcf_action_reoffload_cb(cb, cb_priv, false); flow_block_indr_notify(&cleanup_list); kfree(indr_dev); } EXPORT_SYMBOL(flow_indr_dev_unregister); static void flow_block_indr_init(struct flow_block_cb *flow_block, struct flow_block_offload *bo, struct net_device *dev, struct Qdisc *sch, void *data, void *cb_priv, void (*cleanup)(struct flow_block_cb *block_cb)) { flow_block->indr.binder_type = bo->binder_type; flow_block->indr.data = data; flow_block->indr.cb_priv = cb_priv; flow_block->indr.dev = dev; flow_block->indr.sch = sch; flow_block->indr.cleanup = cleanup; } struct flow_block_cb *flow_indr_block_cb_alloc(flow_setup_cb_t *cb, void *cb_ident, void *cb_priv, void (*release)(void *cb_priv), struct flow_block_offload *bo, struct net_device *dev, struct Qdisc *sch, void *data, void *indr_cb_priv, void (*cleanup)(struct flow_block_cb *block_cb)) { struct flow_block_cb *block_cb; block_cb = flow_block_cb_alloc(cb, cb_ident, cb_priv, release); if (IS_ERR(block_cb)) goto out; flow_block_indr_init(block_cb, bo, dev, sch, data, indr_cb_priv, cleanup); list_add(&block_cb->indr.list, &flow_block_indr_list); out: return block_cb; } EXPORT_SYMBOL(flow_indr_block_cb_alloc); static struct flow_indir_dev_info *find_indir_dev(void *data) { struct flow_indir_dev_info *cur; list_for_each_entry(cur, &flow_indir_dev_list, list) { if (cur->data == data) return cur; } return NULL; } static int indir_dev_add(void *data, struct net_device *dev, struct Qdisc *sch, enum tc_setup_type type, void (*cleanup)(struct flow_block_cb *block_cb), struct flow_block_offload *bo) { struct flow_indir_dev_info *info; info = find_indir_dev(data); if (info) return -EEXIST; info = kzalloc(sizeof(*info), GFP_KERNEL); if (!info) return -ENOMEM; info->data = data; info->dev = dev; info->sch = sch; info->type = type; info->cleanup = cleanup; info->command = bo->command; info->binder_type = bo->binder_type; info->cb_list = bo->cb_list_head; list_add(&info->list, &flow_indir_dev_list); return 0; } static int 
indir_dev_remove(void *data) { struct flow_indir_dev_info *info; info = find_indir_dev(data); if (!info) return -ENOENT; list_del(&info->list); kfree(info); return 0; } int flow_indr_dev_setup_offload(struct net_device *dev, struct Qdisc *sch, enum tc_setup_type type, void *data, struct flow_block_offload *bo, void (*cleanup)(struct flow_block_cb *block_cb)) { struct flow_indr_dev *this; u32 count = 0; int err; mutex_lock(&flow_indr_block_lock); if (bo) { if (bo->command == FLOW_BLOCK_BIND) indir_dev_add(data, dev, sch, type, cleanup, bo); else if (bo->command == FLOW_BLOCK_UNBIND) indir_dev_remove(data); } list_for_each_entry(this, &flow_block_indr_dev_list, list) { err = this->cb(dev, sch, this->cb_priv, type, bo, data, cleanup); if (!err) count++; } mutex_unlock(&flow_indr_block_lock); return (bo && list_empty(&bo->cb_list)) ? -EOPNOTSUPP : count; } EXPORT_SYMBOL(flow_indr_dev_setup_offload); bool flow_indr_dev_exists(void) { return !list_empty(&flow_block_indr_dev_list); } EXPORT_SYMBOL(flow_indr_dev_exists);
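/*
 * Hedged sketch, not from flow_offload.c: the usual driver-side pattern for
 * consuming a struct flow_rule inside a TC_SETUP_CLSFLOWER callback. The
 * driver tests which dissector keys are present with flow_rule_match_key()
 * (an inline helper from <net/flow_offload.h>) and then pulls key/mask pairs
 * out with the exported flow_rule_match helpers above. The function name
 * my_drv_parse_rule() is hypothetical.
 */
static int my_drv_parse_rule(const struct flow_rule *rule)
{
	if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_BASIC)) {
		struct flow_match_basic match;

		flow_rule_match_basic(rule, &match);
		/* match.key->ip_proto is only meaningful where match.mask
		 * has bits set.
		 */
	}

	if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_IPV4_ADDRS)) {
		struct flow_match_ipv4_addrs match;

		flow_rule_match_ipv4_addrs(rule, &match);
		/* match.key->src and match.key->dst are __be32 addresses */
	}

	return 0;
}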
// SPDX-License-Identifier: GPL-2.0-only /* * IPV4 GSO/GRO offload support * Linux INET implementation * * Copyright (C) 2016 secunet Security Networks AG * Author: Steffen Klassert <steffen.klassert@secunet.com> * * ESP GRO support */ #include <linux/skbuff.h> #include <linux/init.h> #include <net/protocol.h> #include <crypto/aead.h> #include <crypto/authenc.h> #include <linux/err.h> #include <linux/module.h> #include <net/gro.h> #include <net/gso.h> #include <net/ip.h> #include <net/xfrm.h> #include <net/esp.h> #include <linux/scatterlist.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <net/udp.h> static struct sk_buff *esp4_gro_receive(struct list_head *head, struct sk_buff *skb) { int offset = skb_gro_offset(skb); struct xfrm_offload *xo; struct xfrm_state *x; int encap_type = 0; __be32 seq; __be32 spi; if (!pskb_pull(skb, offset)) return NULL; if (xfrm_parse_spi(skb, IPPROTO_ESP, &spi, &seq) != 0) goto out; xo = xfrm_offload(skb); if (!xo || !(xo->flags & CRYPTO_DONE)) { struct sec_path *sp = secpath_set(skb); if (!sp) goto out; if (sp->len == XFRM_MAX_DEPTH) goto out_reset; x = xfrm_input_state_lookup(dev_net(skb->dev), skb->mark, (xfrm_address_t *)&ip_hdr(skb)->daddr, spi, IPPROTO_ESP, AF_INET); if (unlikely(x && x->dir && x->dir != XFRM_SA_DIR_IN)) { /* non-offload path will record the error and audit log */ xfrm_state_put(x); x = NULL; } if (!x) goto out_reset; skb->mark = xfrm_smark_get(skb->mark, x); sp->xvec[sp->len++] = x; sp->olen++; xo = xfrm_offload(skb); if (!xo) goto out_reset; } xo->flags |= XFRM_GRO; if (NAPI_GRO_CB(skb)->proto == IPPROTO_UDP) encap_type = UDP_ENCAP_ESPINUDP; XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL; XFRM_SPI_SKB_CB(skb)->family = AF_INET; XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct iphdr, daddr); XFRM_SPI_SKB_CB(skb)->seq = seq; /* We don't need to handle errors from xfrm_input, it does all * the error handling and frees the resources on error.
*/ xfrm_input(skb, IPPROTO_ESP, spi, encap_type); return ERR_PTR(-EINPROGRESS); out_reset: secpath_reset(skb); out: skb_push(skb, offset); NAPI_GRO_CB(skb)->same_flow = 0; NAPI_GRO_CB(skb)->flush = 1; return NULL; } static void esp4_gso_encap(struct xfrm_state *x, struct sk_buff *skb) { struct ip_esp_hdr *esph; struct iphdr *iph = ip_hdr(skb); struct xfrm_offload *xo = xfrm_offload(skb); int proto = iph->protocol; skb_push(skb, -skb_network_offset(skb)); esph = ip_esp_hdr(skb); *skb_mac_header(skb) = IPPROTO_ESP; esph->spi = x->id.spi; esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low); xo->proto = proto; } static struct sk_buff *xfrm4_tunnel_gso_segment(struct xfrm_state *x, struct sk_buff *skb, netdev_features_t features) { __be16 type = x->inner_mode.family == AF_INET6 ? htons(ETH_P_IPV6) : htons(ETH_P_IP); return skb_eth_gso_segment(skb, features, type); } static struct sk_buff *xfrm4_transport_gso_segment(struct xfrm_state *x, struct sk_buff *skb, netdev_features_t features) { const struct net_offload *ops; struct sk_buff *segs = ERR_PTR(-EINVAL); struct xfrm_offload *xo = xfrm_offload(skb); skb->transport_header += x->props.header_len; ops = rcu_dereference(inet_offloads[xo->proto]); if (likely(ops && ops->callbacks.gso_segment)) segs = ops->callbacks.gso_segment(skb, features); return segs; } static struct sk_buff *xfrm4_beet_gso_segment(struct xfrm_state *x, struct sk_buff *skb, netdev_features_t features) { struct xfrm_offload *xo = xfrm_offload(skb); struct sk_buff *segs = ERR_PTR(-EINVAL); const struct net_offload *ops; u8 proto = xo->proto; skb->transport_header += x->props.header_len; if (x->sel.family != AF_INET6) { if (proto == IPPROTO_BEETPH) { struct ip_beet_phdr *ph = (struct ip_beet_phdr *)skb->data; skb->transport_header += ph->hdrlen * 8; proto = ph->nexthdr; } else { skb->transport_header -= IPV4_BEET_PHMAXLEN; } } else { __be16 frag; skb->transport_header += ipv6_skip_exthdr(skb, 0, &proto, &frag); if (proto == IPPROTO_TCP) skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4; } if (proto == IPPROTO_IPV6) skb_shinfo(skb)->gso_type |= SKB_GSO_IPXIP4; __skb_pull(skb, skb_transport_offset(skb)); ops = rcu_dereference(inet_offloads[proto]); if (likely(ops && ops->callbacks.gso_segment)) segs = ops->callbacks.gso_segment(skb, features); return segs; } static struct sk_buff *xfrm4_outer_mode_gso_segment(struct xfrm_state *x, struct sk_buff *skb, netdev_features_t features) { switch (x->outer_mode.encap) { case XFRM_MODE_TUNNEL: return xfrm4_tunnel_gso_segment(x, skb, features); case XFRM_MODE_TRANSPORT: return xfrm4_transport_gso_segment(x, skb, features); case XFRM_MODE_BEET: return xfrm4_beet_gso_segment(x, skb, features); } return ERR_PTR(-EOPNOTSUPP); } static struct sk_buff *esp4_gso_segment(struct sk_buff *skb, netdev_features_t features) { struct xfrm_state *x; struct ip_esp_hdr *esph; struct crypto_aead *aead; netdev_features_t esp_features = features; struct xfrm_offload *xo = xfrm_offload(skb); struct sec_path *sp; if (!xo) return ERR_PTR(-EINVAL); if (!(skb_shinfo(skb)->gso_type & SKB_GSO_ESP)) return ERR_PTR(-EINVAL); sp = skb_sec_path(skb); x = sp->xvec[sp->len - 1]; aead = x->data; esph = ip_esp_hdr(skb); if (esph->spi != x->id.spi) return ERR_PTR(-EINVAL); if (!pskb_may_pull(skb, sizeof(*esph) + crypto_aead_ivsize(aead))) return ERR_PTR(-EINVAL); __skb_pull(skb, sizeof(*esph) + crypto_aead_ivsize(aead)); skb->encap_hdr_csum = 1; if ((!(skb->dev->gso_partial_features & NETIF_F_HW_ESP) && !(features & NETIF_F_HW_ESP)) || x->xso.dev != skb->dev) esp_features = 
features & ~(NETIF_F_SG | NETIF_F_CSUM_MASK | NETIF_F_SCTP_CRC); else if (!(features & NETIF_F_HW_ESP_TX_CSUM) && !(skb->dev->gso_partial_features & NETIF_F_HW_ESP_TX_CSUM)) esp_features = features & ~(NETIF_F_CSUM_MASK | NETIF_F_SCTP_CRC); xo->flags |= XFRM_GSO_SEGMENT; return xfrm4_outer_mode_gso_segment(x, skb, esp_features); } static int esp_input_tail(struct xfrm_state *x, struct sk_buff *skb) { struct crypto_aead *aead = x->data; struct xfrm_offload *xo = xfrm_offload(skb); if (!pskb_may_pull(skb, sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead))) return -EINVAL; if (!(xo->flags & CRYPTO_DONE)) skb->ip_summed = CHECKSUM_NONE; return esp_input_done2(skb, 0); } static int esp_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features_t features) { int err; int alen; int blksize; struct xfrm_offload *xo; struct ip_esp_hdr *esph; struct crypto_aead *aead; struct esp_info esp; bool hw_offload = true; __u32 seq; int encap_type = 0; esp.inplace = true; xo = xfrm_offload(skb); if (!xo) return -EINVAL; if ((!(features & NETIF_F_HW_ESP) && !(skb->dev->gso_partial_features & NETIF_F_HW_ESP)) || x->xso.dev != skb->dev) { xo->flags |= CRYPTO_FALLBACK; hw_offload = false; } esp.proto = xo->proto; /* skb is pure payload to encrypt */ aead = x->data; alen = crypto_aead_authsize(aead); esp.tfclen = 0; /* XXX: Add support for tfc padding here. */ blksize = ALIGN(crypto_aead_blocksize(aead), 4); esp.clen = ALIGN(skb->len + 2 + esp.tfclen, blksize); esp.plen = esp.clen - skb->len - esp.tfclen; esp.tailen = esp.tfclen + esp.plen + alen; esp.esph = ip_esp_hdr(skb); if (x->encap) encap_type = x->encap->encap_type; if (!hw_offload || !skb_is_gso(skb) || (hw_offload && encap_type == UDP_ENCAP_ESPINUDP)) { esp.nfrags = esp_output_head(x, skb, &esp); if (esp.nfrags < 0) return esp.nfrags; } seq = xo->seq.low; esph = esp.esph; esph->spi = x->id.spi; skb_push(skb, -skb_network_offset(skb)); if (xo->flags & XFRM_GSO_SEGMENT) { esph->seq_no = htonl(seq); if (!skb_is_gso(skb)) xo->seq.low++; else xo->seq.low += skb_shinfo(skb)->gso_segs; } if (xo->seq.low < seq) xo->seq.hi++; esp.seqno = cpu_to_be64(seq + ((u64)xo->seq.hi << 32)); if (hw_offload && encap_type == UDP_ENCAP_ESPINUDP) { /* In the XFRM stack, the encapsulation protocol is set to iphdr->protocol by * setting *skb_mac_header(skb) (see esp_output_udp_encap()) where skb->mac_header * points to iphdr->protocol (see xfrm4_tunnel_encap_add()). * However, in esp_xmit(), skb->mac_header doesn't point to iphdr->protocol. * Therefore, the protocol field needs to be corrected. 
*/ ip_hdr(skb)->protocol = IPPROTO_UDP; esph->seq_no = htonl(seq); } ip_hdr(skb)->tot_len = htons(skb->len); ip_send_check(ip_hdr(skb)); if (hw_offload) { if (!skb_ext_add(skb, SKB_EXT_SEC_PATH)) return -ENOMEM; xo = xfrm_offload(skb); if (!xo) return -EINVAL; xo->flags |= XFRM_XMIT; return 0; } err = esp_output_tail(x, skb, &esp); if (err) return err; secpath_reset(skb); if (skb_needs_linearize(skb, skb->dev->features) && __skb_linearize(skb)) return -ENOMEM; return 0; } static const struct net_offload esp4_offload = { .callbacks = { .gro_receive = esp4_gro_receive, .gso_segment = esp4_gso_segment, }, }; static const struct xfrm_type_offload esp_type_offload = { .owner = THIS_MODULE, .proto = IPPROTO_ESP, .input_tail = esp_input_tail, .xmit = esp_xmit, .encap = esp4_gso_encap, }; static int __init esp4_offload_init(void) { if (xfrm_register_type_offload(&esp_type_offload, AF_INET) < 0) { pr_info("%s: can't add xfrm type offload\n", __func__); return -EAGAIN; } return inet_add_offload(&esp4_offload, IPPROTO_ESP); } static void __exit esp4_offload_exit(void) { xfrm_unregister_type_offload(&esp_type_offload, AF_INET); inet_del_offload(&esp4_offload, IPPROTO_ESP); } module_init(esp4_offload_init); module_exit(esp4_offload_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Steffen Klassert <steffen.klassert@secunet.com>"); MODULE_ALIAS_XFRM_OFFLOAD_TYPE(AF_INET, XFRM_PROTO_ESP); MODULE_DESCRIPTION("IPV4 GSO/GRO offload support");
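/*
 * Hedged sketch, not from esp4_offload.c: the feature-masking decision made
 * inside esp4_gso_segment() above, pulled out as a hypothetical helper for
 * clarity. If the device cannot offload ESP (or the SA is bound to another
 * device), scatter-gather and checksum offloads are stripped so the software
 * fallback handles a linear, self-checksummed skb; otherwise only the
 * checksum features are dropped when ESP TX checksum offload is missing.
 */
static netdev_features_t my_esp_mask_features(const struct sk_buff *skb,
					      const struct xfrm_state *x,
					      netdev_features_t features)
{
	if ((!(skb->dev->gso_partial_features & NETIF_F_HW_ESP) &&
	     !(features & NETIF_F_HW_ESP)) || x->xso.dev != skb->dev)
		return features & ~(NETIF_F_SG | NETIF_F_CSUM_MASK |
				    NETIF_F_SCTP_CRC);

	if (!(features & NETIF_F_HW_ESP_TX_CSUM) &&
	    !(skb->dev->gso_partial_features & NETIF_F_HW_ESP_TX_CSUM))
		return features & ~(NETIF_F_CSUM_MASK | NETIF_F_SCTP_CRC);

	return features;
}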
// SPDX-License-Identifier: GPL-2.0-only /* * * Authors: * Alexander Aring <aar@pengutronix.de> * * Based on: net/mac80211/util.c */ #include "ieee802154_i.h" #include "driver-ops.h" /* privid for wpan_phys to determine whether they belong to us or not */ const void *const mac802154_wpan_phy_privid = &mac802154_wpan_phy_privid; /** * ieee802154_wake_queue - wake ieee802154 queue * @hw: main hardware object * * Transceivers usually have either one transmit framebuffer or one framebuffer * for both transmitting and receiving. Hence, the core currently only handles * one frame at a time for each phy, which means we had to stop the queue to * avoid new skbs arriving during the transmission. The queue then needs to be * woken up after the operation. */ static void ieee802154_wake_queue(struct ieee802154_hw *hw) { struct ieee802154_local *local = hw_to_local(hw); struct ieee802154_sub_if_data *sdata; rcu_read_lock(); clear_bit(WPAN_PHY_FLAG_STATE_QUEUE_STOPPED, &local->phy->flags); list_for_each_entry_rcu(sdata, &local->interfaces, list) { if (!sdata->dev) continue; netif_wake_queue(sdata->dev); } rcu_read_unlock(); } /** * ieee802154_stop_queue - stop ieee802154 queue * @hw: main hardware object * * Transceivers usually have either one transmit framebuffer or one framebuffer * for both transmitting and receiving. Hence, the core currently only handles * one frame at a time for each phy, which means we need to tell upper layers to * stop giving us new skbs while we are busy with the transmitted one. The queue * must then be stopped before transmitting.
*/ static void ieee802154_stop_queue(struct ieee802154_hw *hw) { struct ieee802154_local *local = hw_to_local(hw); struct ieee802154_sub_if_data *sdata; rcu_read_lock(); list_for_each_entry_rcu(sdata, &local->interfaces, list) { if (!sdata->dev) continue; netif_stop_queue(sdata->dev); } rcu_read_unlock(); } void ieee802154_hold_queue(struct ieee802154_local *local) { unsigned long flags; spin_lock_irqsave(&local->phy->queue_lock, flags); if (!atomic_fetch_inc(&local->phy->hold_txs)) ieee802154_stop_queue(&local->hw); spin_unlock_irqrestore(&local->phy->queue_lock, flags); } void ieee802154_release_queue(struct ieee802154_local *local) { unsigned long flags; spin_lock_irqsave(&local->phy->queue_lock, flags); if (atomic_dec_and_test(&local->phy->hold_txs)) ieee802154_wake_queue(&local->hw); spin_unlock_irqrestore(&local->phy->queue_lock, flags); } void ieee802154_disable_queue(struct ieee802154_local *local) { struct ieee802154_sub_if_data *sdata; rcu_read_lock(); list_for_each_entry_rcu(sdata, &local->interfaces, list) { if (!sdata->dev) continue; netif_tx_disable(sdata->dev); } rcu_read_unlock(); } enum hrtimer_restart ieee802154_xmit_ifs_timer(struct hrtimer *timer) { struct ieee802154_local *local = container_of(timer, struct ieee802154_local, ifs_timer); ieee802154_release_queue(local); return HRTIMER_NORESTART; } void ieee802154_xmit_complete(struct ieee802154_hw *hw, struct sk_buff *skb, bool ifs_handling) { struct ieee802154_local *local = hw_to_local(hw); local->tx_result = IEEE802154_SUCCESS; if (ifs_handling) { u8 max_sifs_size; /* If transceiver sets CRC on his own we need to use lifs * threshold len above 16 otherwise 18, because it's not * part of skb->len. */ if (hw->flags & IEEE802154_HW_TX_OMIT_CKSUM) max_sifs_size = IEEE802154_MAX_SIFS_FRAME_SIZE - IEEE802154_FCS_LEN; else max_sifs_size = IEEE802154_MAX_SIFS_FRAME_SIZE; if (skb->len > max_sifs_size) hrtimer_start(&local->ifs_timer, hw->phy->lifs_period * NSEC_PER_USEC, HRTIMER_MODE_REL); else hrtimer_start(&local->ifs_timer, hw->phy->sifs_period * NSEC_PER_USEC, HRTIMER_MODE_REL); } else { ieee802154_release_queue(local); } dev_consume_skb_any(skb); if (atomic_dec_and_test(&hw->phy->ongoing_txs)) wake_up(&hw->phy->sync_txq); } EXPORT_SYMBOL(ieee802154_xmit_complete); void ieee802154_xmit_error(struct ieee802154_hw *hw, struct sk_buff *skb, int reason) { struct ieee802154_local *local = hw_to_local(hw); local->tx_result = reason; ieee802154_release_queue(local); dev_kfree_skb_any(skb); if (atomic_dec_and_test(&hw->phy->ongoing_txs)) wake_up(&hw->phy->sync_txq); } EXPORT_SYMBOL(ieee802154_xmit_error); void ieee802154_xmit_hw_error(struct ieee802154_hw *hw, struct sk_buff *skb) { ieee802154_xmit_error(hw, skb, IEEE802154_SYSTEM_ERROR); } EXPORT_SYMBOL(ieee802154_xmit_hw_error); void ieee802154_stop_device(struct ieee802154_local *local) { flush_workqueue(local->workqueue); hrtimer_cancel(&local->ifs_timer); drv_stop(local); }
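/*
 * Hedged sketch, not part of the file above: how a transceiver driver's
 * TX-done path typically hands a transmitted frame back to the stack. With
 * ifs_handling set, the queue is released later from the IFS hrtimer seen
 * above rather than immediately. struct my_radio and my_radio_tx_done_irq()
 * are hypothetical driver names.
 */
struct my_radio {
	struct ieee802154_hw *hw;
	struct sk_buff *tx_skb;
};

static void my_radio_tx_done_irq(struct my_radio *radio)
{
	ieee802154_xmit_complete(radio->hw, radio->tx_skb, true);
	radio->tx_skb = NULL;
}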
// SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2011 Instituto Nokia de Tecnologia * * Authors: * Lauro Ramos Venancio <lauro.venancio@openbossa.org> * Aloisio Almeida Jr <aloisio.almeida@openbossa.org> * * Vendor commands implementation based on net/wireless/nl80211.c * which is: * * Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH */ #define pr_fmt(fmt) KBUILD_MODNAME ": %s: " fmt, __func__ #include <net/genetlink.h> #include <linux/nfc.h> #include <linux/slab.h> #include "nfc.h" #include "llcp.h" static const struct genl_multicast_group nfc_genl_mcgrps[] = { { .name = NFC_GENL_MCAST_EVENT_NAME, }, }; static struct genl_family nfc_genl_family; static const struct nla_policy nfc_genl_policy[NFC_ATTR_MAX + 1] = { [NFC_ATTR_DEVICE_INDEX] = { .type = NLA_U32 }, [NFC_ATTR_DEVICE_NAME] = { .type = NLA_STRING, .len = NFC_DEVICE_NAME_MAXSIZE }, [NFC_ATTR_PROTOCOLS] = { .type = NLA_U32 }, [NFC_ATTR_TARGET_INDEX] = { .type = NLA_U32 }, [NFC_ATTR_COMM_MODE] = { .type = NLA_U8 }, [NFC_ATTR_RF_MODE] = { .type = NLA_U8 }, [NFC_ATTR_DEVICE_POWERED] = { .type = NLA_U8 }, [NFC_ATTR_IM_PROTOCOLS] = { .type = NLA_U32 }, [NFC_ATTR_TM_PROTOCOLS] = { .type = NLA_U32 }, [NFC_ATTR_LLC_PARAM_LTO] = { .type = NLA_U8 }, [NFC_ATTR_LLC_PARAM_RW] = { .type = NLA_U8 }, [NFC_ATTR_LLC_PARAM_MIUX] = { .type = NLA_U16 }, [NFC_ATTR_LLC_SDP] = { .type = NLA_NESTED }, [NFC_ATTR_FIRMWARE_NAME] = { .type = NLA_STRING, .len = NFC_FIRMWARE_NAME_MAXSIZE }, [NFC_ATTR_SE_INDEX] = { .type = NLA_U32 }, [NFC_ATTR_SE_APDU] = { .type = NLA_BINARY }, [NFC_ATTR_VENDOR_ID] = { .type = NLA_U32 }, [NFC_ATTR_VENDOR_SUBCMD] = { .type = NLA_U32 }, [NFC_ATTR_VENDOR_DATA] = { .type = NLA_BINARY }, }; static const struct nla_policy nfc_sdp_genl_policy[NFC_SDP_ATTR_MAX + 1] = { [NFC_SDP_ATTR_URI] = {
.type = NLA_STRING, .len = U8_MAX - 4 }, [NFC_SDP_ATTR_SAP] = { .type = NLA_U8 }, }; static int nfc_genl_send_target(struct sk_buff *msg, struct nfc_target *target, struct netlink_callback *cb, int flags) { void *hdr; hdr = genlmsg_put(msg, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &nfc_genl_family, flags, NFC_CMD_GET_TARGET); if (!hdr) return -EMSGSIZE; genl_dump_check_consistent(cb, hdr); if (nla_put_u32(msg, NFC_ATTR_TARGET_INDEX, target->idx) || nla_put_u32(msg, NFC_ATTR_PROTOCOLS, target->supported_protocols) || nla_put_u16(msg, NFC_ATTR_TARGET_SENS_RES, target->sens_res) || nla_put_u8(msg, NFC_ATTR_TARGET_SEL_RES, target->sel_res)) goto nla_put_failure; if (target->nfcid1_len > 0 && nla_put(msg, NFC_ATTR_TARGET_NFCID1, target->nfcid1_len, target->nfcid1)) goto nla_put_failure; if (target->sensb_res_len > 0 && nla_put(msg, NFC_ATTR_TARGET_SENSB_RES, target->sensb_res_len, target->sensb_res)) goto nla_put_failure; if (target->sensf_res_len > 0 && nla_put(msg, NFC_ATTR_TARGET_SENSF_RES, target->sensf_res_len, target->sensf_res)) goto nla_put_failure; if (target->is_iso15693) { if (nla_put_u8(msg, NFC_ATTR_TARGET_ISO15693_DSFID, target->iso15693_dsfid) || nla_put(msg, NFC_ATTR_TARGET_ISO15693_UID, sizeof(target->iso15693_uid), target->iso15693_uid)) goto nla_put_failure; } if (target->ats_len > 0 && nla_put(msg, NFC_ATTR_TARGET_ATS, target->ats_len, target->ats)) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } static struct nfc_dev *__get_device_from_cb(struct netlink_callback *cb) { const struct genl_dumpit_info *info = genl_dumpit_info(cb); struct nfc_dev *dev; u32 idx; if (!info->info.attrs[NFC_ATTR_DEVICE_INDEX]) return ERR_PTR(-EINVAL); idx = nla_get_u32(info->info.attrs[NFC_ATTR_DEVICE_INDEX]); dev = nfc_get_device(idx); if (!dev) return ERR_PTR(-ENODEV); return dev; } static int nfc_genl_dump_targets(struct sk_buff *skb, struct netlink_callback *cb) { int i = cb->args[0]; struct nfc_dev *dev = (struct nfc_dev *) cb->args[1]; int rc; if (!dev) { dev = __get_device_from_cb(cb); if (IS_ERR(dev)) return PTR_ERR(dev); cb->args[1] = (long) dev; } device_lock(&dev->dev); cb->seq = dev->targets_generation; while (i < dev->n_targets) { rc = nfc_genl_send_target(skb, &dev->targets[i], cb, NLM_F_MULTI); if (rc < 0) break; i++; } device_unlock(&dev->dev); cb->args[0] = i; return skb->len; } static int nfc_genl_dump_targets_done(struct netlink_callback *cb) { struct nfc_dev *dev = (struct nfc_dev *) cb->args[1]; if (dev) nfc_put_device(dev); return 0; } int nfc_genl_targets_found(struct nfc_dev *dev) { struct sk_buff *msg; void *hdr; dev->genl_data.poll_req_portid = 0; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, NFC_EVENT_TARGETS_FOUND); if (!hdr) goto free_msg; if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx)) goto nla_put_failure; genlmsg_end(msg, hdr); return genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_ATOMIC); nla_put_failure: free_msg: nlmsg_free(msg); return -EMSGSIZE; } int nfc_genl_target_lost(struct nfc_dev *dev, u32 target_idx) { struct sk_buff *msg; void *hdr; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, NFC_EVENT_TARGET_LOST); if (!hdr) goto free_msg; if (nla_put_string(msg, NFC_ATTR_DEVICE_NAME, nfc_device_name(dev)) || nla_put_u32(msg, NFC_ATTR_TARGET_INDEX, target_idx)) goto nla_put_failure; genlmsg_end(msg, hdr); 
genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_KERNEL); return 0; nla_put_failure: free_msg: nlmsg_free(msg); return -EMSGSIZE; } int nfc_genl_tm_activated(struct nfc_dev *dev, u32 protocol) { struct sk_buff *msg; void *hdr; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, NFC_EVENT_TM_ACTIVATED); if (!hdr) goto free_msg; if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx)) goto nla_put_failure; if (nla_put_u32(msg, NFC_ATTR_TM_PROTOCOLS, protocol)) goto nla_put_failure; genlmsg_end(msg, hdr); genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_KERNEL); return 0; nla_put_failure: free_msg: nlmsg_free(msg); return -EMSGSIZE; } int nfc_genl_tm_deactivated(struct nfc_dev *dev) { struct sk_buff *msg; void *hdr; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, NFC_EVENT_TM_DEACTIVATED); if (!hdr) goto free_msg; if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx)) goto nla_put_failure; genlmsg_end(msg, hdr); genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_KERNEL); return 0; nla_put_failure: free_msg: nlmsg_free(msg); return -EMSGSIZE; } static int nfc_genl_setup_device_added(struct nfc_dev *dev, struct sk_buff *msg) { if (nla_put_string(msg, NFC_ATTR_DEVICE_NAME, nfc_device_name(dev)) || nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx) || nla_put_u32(msg, NFC_ATTR_PROTOCOLS, dev->supported_protocols) || nla_put_u8(msg, NFC_ATTR_DEVICE_POWERED, dev->dev_up) || nla_put_u8(msg, NFC_ATTR_RF_MODE, dev->rf_mode)) return -1; return 0; } int nfc_genl_device_added(struct nfc_dev *dev) { struct sk_buff *msg; void *hdr; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, NFC_EVENT_DEVICE_ADDED); if (!hdr) goto free_msg; if (nfc_genl_setup_device_added(dev, msg)) goto nla_put_failure; genlmsg_end(msg, hdr); genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_KERNEL); return 0; nla_put_failure: free_msg: nlmsg_free(msg); return -EMSGSIZE; } int nfc_genl_device_removed(struct nfc_dev *dev) { struct sk_buff *msg; void *hdr; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, NFC_EVENT_DEVICE_REMOVED); if (!hdr) goto free_msg; if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx)) goto nla_put_failure; genlmsg_end(msg, hdr); genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_KERNEL); return 0; nla_put_failure: free_msg: nlmsg_free(msg); return -EMSGSIZE; } int nfc_genl_llc_send_sdres(struct nfc_dev *dev, struct hlist_head *sdres_list) { struct sk_buff *msg; struct nlattr *sdp_attr, *uri_attr; struct nfc_llcp_sdp_tlv *sdres; struct hlist_node *n; void *hdr; int rc = -EMSGSIZE; int i; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, NFC_EVENT_LLC_SDRES); if (!hdr) goto free_msg; if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx)) goto nla_put_failure; sdp_attr = nla_nest_start_noflag(msg, NFC_ATTR_LLC_SDP); if (sdp_attr == NULL) { rc = -ENOMEM; goto nla_put_failure; } i = 1; hlist_for_each_entry_safe(sdres, n, sdres_list, node) { pr_debug("uri: %s, sap: %d\n", sdres->uri, sdres->sap); uri_attr = nla_nest_start_noflag(msg, i++); if (uri_attr == NULL) { rc = -ENOMEM; goto nla_put_failure; } if (nla_put_u8(msg, NFC_SDP_ATTR_SAP, sdres->sap)) goto nla_put_failure; if (nla_put_string(msg, NFC_SDP_ATTR_URI, sdres->uri)) goto 
nla_put_failure; nla_nest_end(msg, uri_attr); hlist_del(&sdres->node); nfc_llcp_free_sdp_tlv(sdres); } nla_nest_end(msg, sdp_attr); genlmsg_end(msg, hdr); return genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_ATOMIC); nla_put_failure: free_msg: nlmsg_free(msg); nfc_llcp_free_sdp_tlv_list(sdres_list); return rc; } int nfc_genl_se_added(struct nfc_dev *dev, u32 se_idx, u16 type) { struct sk_buff *msg; void *hdr; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, NFC_EVENT_SE_ADDED); if (!hdr) goto free_msg; if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx) || nla_put_u32(msg, NFC_ATTR_SE_INDEX, se_idx) || nla_put_u8(msg, NFC_ATTR_SE_TYPE, type)) goto nla_put_failure; genlmsg_end(msg, hdr); genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_KERNEL); return 0; nla_put_failure: free_msg: nlmsg_free(msg); return -EMSGSIZE; } int nfc_genl_se_removed(struct nfc_dev *dev, u32 se_idx) { struct sk_buff *msg; void *hdr; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, NFC_EVENT_SE_REMOVED); if (!hdr) goto free_msg; if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx) || nla_put_u32(msg, NFC_ATTR_SE_INDEX, se_idx)) goto nla_put_failure; genlmsg_end(msg, hdr); genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_KERNEL); return 0; nla_put_failure: free_msg: nlmsg_free(msg); return -EMSGSIZE; } int nfc_genl_se_transaction(struct nfc_dev *dev, u8 se_idx, struct nfc_evt_transaction *evt_transaction) { struct nfc_se *se; struct sk_buff *msg; void *hdr; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, NFC_EVENT_SE_TRANSACTION); if (!hdr) goto free_msg; se = nfc_find_se(dev, se_idx); if (!se) goto free_msg; if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx) || nla_put_u32(msg, NFC_ATTR_SE_INDEX, se_idx) || nla_put_u8(msg, NFC_ATTR_SE_TYPE, se->type) || nla_put(msg, NFC_ATTR_SE_AID, evt_transaction->aid_len, evt_transaction->aid) || nla_put(msg, NFC_ATTR_SE_PARAMS, evt_transaction->params_len, evt_transaction->params)) goto nla_put_failure; /* evt_transaction is no more used */ devm_kfree(&dev->dev, evt_transaction); genlmsg_end(msg, hdr); genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_KERNEL); return 0; nla_put_failure: free_msg: /* evt_transaction is no more used */ devm_kfree(&dev->dev, evt_transaction); nlmsg_free(msg); return -EMSGSIZE; } int nfc_genl_se_connectivity(struct nfc_dev *dev, u8 se_idx) { const struct nfc_se *se; struct sk_buff *msg; void *hdr; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, NFC_EVENT_SE_CONNECTIVITY); if (!hdr) goto free_msg; se = nfc_find_se(dev, se_idx); if (!se) goto free_msg; if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx) || nla_put_u32(msg, NFC_ATTR_SE_INDEX, se_idx) || nla_put_u8(msg, NFC_ATTR_SE_TYPE, se->type)) goto nla_put_failure; genlmsg_end(msg, hdr); genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_KERNEL); return 0; nla_put_failure: free_msg: nlmsg_free(msg); return -EMSGSIZE; } static int nfc_genl_send_device(struct sk_buff *msg, struct nfc_dev *dev, u32 portid, u32 seq, struct netlink_callback *cb, int flags) { void *hdr; hdr = genlmsg_put(msg, portid, seq, &nfc_genl_family, flags, NFC_CMD_GET_DEVICE); if (!hdr) return -EMSGSIZE; if (cb) genl_dump_check_consistent(cb, hdr); if (nfc_genl_setup_device_added(dev, msg)) goto nla_put_failure; 
genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } static int nfc_genl_dump_devices(struct sk_buff *skb, struct netlink_callback *cb) { struct class_dev_iter *iter = (struct class_dev_iter *) cb->args[0]; struct nfc_dev *dev = (struct nfc_dev *) cb->args[1]; bool first_call = false; if (!iter) { first_call = true; iter = kmalloc(sizeof(struct class_dev_iter), GFP_KERNEL); if (!iter) return -ENOMEM; cb->args[0] = (long) iter; } mutex_lock(&nfc_devlist_mutex); cb->seq = nfc_devlist_generation; if (first_call) { nfc_device_iter_init(iter); dev = nfc_device_iter_next(iter); } while (dev) { int rc; rc = nfc_genl_send_device(skb, dev, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, cb, NLM_F_MULTI); if (rc < 0) break; dev = nfc_device_iter_next(iter); } mutex_unlock(&nfc_devlist_mutex); cb->args[1] = (long) dev; return skb->len; } static int nfc_genl_dump_devices_done(struct netlink_callback *cb) { struct class_dev_iter *iter = (struct class_dev_iter *) cb->args[0]; if (iter) { nfc_device_iter_exit(iter); kfree(iter); } return 0; } int nfc_genl_dep_link_up_event(struct nfc_dev *dev, u32 target_idx, u8 comm_mode, u8 rf_mode) { struct sk_buff *msg; void *hdr; pr_debug("DEP link is up\n"); msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, NFC_CMD_DEP_LINK_UP); if (!hdr) goto free_msg; if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx)) goto nla_put_failure; if (rf_mode == NFC_RF_INITIATOR && nla_put_u32(msg, NFC_ATTR_TARGET_INDEX, target_idx)) goto nla_put_failure; if (nla_put_u8(msg, NFC_ATTR_COMM_MODE, comm_mode) || nla_put_u8(msg, NFC_ATTR_RF_MODE, rf_mode)) goto nla_put_failure; genlmsg_end(msg, hdr); dev->dep_link_up = true; genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_ATOMIC); return 0; nla_put_failure: free_msg: nlmsg_free(msg); return -EMSGSIZE; } int nfc_genl_dep_link_down_event(struct nfc_dev *dev) { struct sk_buff *msg; void *hdr; pr_debug("DEP link is down\n"); msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, NFC_CMD_DEP_LINK_DOWN); if (!hdr) goto free_msg; if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx)) goto nla_put_failure; genlmsg_end(msg, hdr); genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_ATOMIC); return 0; nla_put_failure: free_msg: nlmsg_free(msg); return -EMSGSIZE; } static int nfc_genl_get_device(struct sk_buff *skb, struct genl_info *info) { struct sk_buff *msg; struct nfc_dev *dev; u32 idx; int rc = -ENOBUFS; if (!info->attrs[NFC_ATTR_DEVICE_INDEX]) return -EINVAL; idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); dev = nfc_get_device(idx); if (!dev) return -ENODEV; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) { rc = -ENOMEM; goto out_putdev; } rc = nfc_genl_send_device(msg, dev, info->snd_portid, info->snd_seq, NULL, 0); if (rc < 0) goto out_free; nfc_put_device(dev); return genlmsg_reply(msg, info); out_free: nlmsg_free(msg); out_putdev: nfc_put_device(dev); return rc; } static int nfc_genl_dev_up(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; int rc; u32 idx; if (!info->attrs[NFC_ATTR_DEVICE_INDEX]) return -EINVAL; idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); dev = nfc_get_device(idx); if (!dev) return -ENODEV; rc = nfc_dev_up(dev); nfc_put_device(dev); return rc; } static int nfc_genl_dev_down(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; int rc; u32 idx; if 
(!info->attrs[NFC_ATTR_DEVICE_INDEX]) return -EINVAL; idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); dev = nfc_get_device(idx); if (!dev) return -ENODEV; rc = nfc_dev_down(dev); nfc_put_device(dev); return rc; } static int nfc_genl_start_poll(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; int rc; u32 idx; u32 im_protocols = 0, tm_protocols = 0; pr_debug("Poll start\n"); if (!info->attrs[NFC_ATTR_DEVICE_INDEX] || ((!info->attrs[NFC_ATTR_IM_PROTOCOLS] && !info->attrs[NFC_ATTR_PROTOCOLS]) && !info->attrs[NFC_ATTR_TM_PROTOCOLS])) return -EINVAL; idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); if (info->attrs[NFC_ATTR_TM_PROTOCOLS]) tm_protocols = nla_get_u32(info->attrs[NFC_ATTR_TM_PROTOCOLS]); if (info->attrs[NFC_ATTR_IM_PROTOCOLS]) im_protocols = nla_get_u32(info->attrs[NFC_ATTR_IM_PROTOCOLS]); else if (info->attrs[NFC_ATTR_PROTOCOLS]) im_protocols = nla_get_u32(info->attrs[NFC_ATTR_PROTOCOLS]); dev = nfc_get_device(idx); if (!dev) return -ENODEV; mutex_lock(&dev->genl_data.genl_data_mutex); rc = nfc_start_poll(dev, im_protocols, tm_protocols); if (!rc) dev->genl_data.poll_req_portid = info->snd_portid; mutex_unlock(&dev->genl_data.genl_data_mutex); nfc_put_device(dev); return rc; } static int nfc_genl_stop_poll(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; int rc; u32 idx; if (!info->attrs[NFC_ATTR_DEVICE_INDEX]) return -EINVAL; idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); dev = nfc_get_device(idx); if (!dev) return -ENODEV; device_lock(&dev->dev); if (!dev->polling) { device_unlock(&dev->dev); nfc_put_device(dev); return -EINVAL; } device_unlock(&dev->dev); mutex_lock(&dev->genl_data.genl_data_mutex); if (dev->genl_data.poll_req_portid != info->snd_portid) { rc = -EBUSY; goto out; } rc = nfc_stop_poll(dev); dev->genl_data.poll_req_portid = 0; out: mutex_unlock(&dev->genl_data.genl_data_mutex); nfc_put_device(dev); return rc; } static int nfc_genl_activate_target(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; u32 device_idx, target_idx, protocol; int rc; if (!info->attrs[NFC_ATTR_DEVICE_INDEX] || !info->attrs[NFC_ATTR_TARGET_INDEX] || !info->attrs[NFC_ATTR_PROTOCOLS]) return -EINVAL; device_idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); dev = nfc_get_device(device_idx); if (!dev) return -ENODEV; target_idx = nla_get_u32(info->attrs[NFC_ATTR_TARGET_INDEX]); protocol = nla_get_u32(info->attrs[NFC_ATTR_PROTOCOLS]); nfc_deactivate_target(dev, target_idx, NFC_TARGET_MODE_SLEEP); rc = nfc_activate_target(dev, target_idx, protocol); nfc_put_device(dev); return rc; } static int nfc_genl_deactivate_target(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; u32 device_idx, target_idx; int rc; if (!info->attrs[NFC_ATTR_DEVICE_INDEX] || !info->attrs[NFC_ATTR_TARGET_INDEX]) return -EINVAL; device_idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); dev = nfc_get_device(device_idx); if (!dev) return -ENODEV; target_idx = nla_get_u32(info->attrs[NFC_ATTR_TARGET_INDEX]); rc = nfc_deactivate_target(dev, target_idx, NFC_TARGET_MODE_SLEEP); nfc_put_device(dev); return rc; } static int nfc_genl_dep_link_up(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; int rc, tgt_idx; u32 idx; u8 comm; pr_debug("DEP link up\n"); if (!info->attrs[NFC_ATTR_DEVICE_INDEX] || !info->attrs[NFC_ATTR_COMM_MODE]) return -EINVAL; idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); if (!info->attrs[NFC_ATTR_TARGET_INDEX]) tgt_idx = NFC_TARGET_IDX_ANY; else tgt_idx = 
nla_get_u32(info->attrs[NFC_ATTR_TARGET_INDEX]); comm = nla_get_u8(info->attrs[NFC_ATTR_COMM_MODE]); if (comm != NFC_COMM_ACTIVE && comm != NFC_COMM_PASSIVE) return -EINVAL; dev = nfc_get_device(idx); if (!dev) return -ENODEV; rc = nfc_dep_link_up(dev, tgt_idx, comm); nfc_put_device(dev); return rc; } static int nfc_genl_dep_link_down(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; int rc; u32 idx; if (!info->attrs[NFC_ATTR_DEVICE_INDEX]) return -EINVAL; idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); dev = nfc_get_device(idx); if (!dev) return -ENODEV; rc = nfc_dep_link_down(dev); nfc_put_device(dev); return rc; } static int nfc_genl_send_params(struct sk_buff *msg, struct nfc_llcp_local *local, u32 portid, u32 seq) { void *hdr; hdr = genlmsg_put(msg, portid, seq, &nfc_genl_family, 0, NFC_CMD_LLC_GET_PARAMS); if (!hdr) return -EMSGSIZE; if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, local->dev->idx) || nla_put_u8(msg, NFC_ATTR_LLC_PARAM_LTO, local->lto) || nla_put_u8(msg, NFC_ATTR_LLC_PARAM_RW, local->rw) || nla_put_u16(msg, NFC_ATTR_LLC_PARAM_MIUX, be16_to_cpu(local->miux))) goto nla_put_failure; genlmsg_end(msg, hdr); return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } static int nfc_genl_llc_get_params(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; struct nfc_llcp_local *local; int rc = 0; struct sk_buff *msg = NULL; u32 idx; if (!info->attrs[NFC_ATTR_DEVICE_INDEX]) return -EINVAL; idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); dev = nfc_get_device(idx); if (!dev) return -ENODEV; device_lock(&dev->dev); local = nfc_llcp_find_local(dev); if (!local) { rc = -ENODEV; goto exit; } msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) { rc = -ENOMEM; goto put_local; } rc = nfc_genl_send_params(msg, local, info->snd_portid, info->snd_seq); put_local: nfc_llcp_local_put(local); exit: device_unlock(&dev->dev); nfc_put_device(dev); if (rc < 0) { if (msg) nlmsg_free(msg); return rc; } return genlmsg_reply(msg, info); } static int nfc_genl_llc_set_params(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; struct nfc_llcp_local *local; u8 rw = 0; u16 miux = 0; u32 idx; int rc = 0; if (!info->attrs[NFC_ATTR_DEVICE_INDEX] || (!info->attrs[NFC_ATTR_LLC_PARAM_LTO] && !info->attrs[NFC_ATTR_LLC_PARAM_RW] && !info->attrs[NFC_ATTR_LLC_PARAM_MIUX])) return -EINVAL; if (info->attrs[NFC_ATTR_LLC_PARAM_RW]) { rw = nla_get_u8(info->attrs[NFC_ATTR_LLC_PARAM_RW]); if (rw > LLCP_MAX_RW) return -EINVAL; } if (info->attrs[NFC_ATTR_LLC_PARAM_MIUX]) { miux = nla_get_u16(info->attrs[NFC_ATTR_LLC_PARAM_MIUX]); if (miux > LLCP_MAX_MIUX) return -EINVAL; } idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); dev = nfc_get_device(idx); if (!dev) return -ENODEV; device_lock(&dev->dev); local = nfc_llcp_find_local(dev); if (!local) { rc = -ENODEV; goto exit; } if (info->attrs[NFC_ATTR_LLC_PARAM_LTO]) { if (dev->dep_link_up) { rc = -EINPROGRESS; goto put_local; } local->lto = nla_get_u8(info->attrs[NFC_ATTR_LLC_PARAM_LTO]); } if (info->attrs[NFC_ATTR_LLC_PARAM_RW]) local->rw = rw; if (info->attrs[NFC_ATTR_LLC_PARAM_MIUX]) local->miux = cpu_to_be16(miux); put_local: nfc_llcp_local_put(local); exit: device_unlock(&dev->dev); nfc_put_device(dev); return rc; } static int nfc_genl_llc_sdreq(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; struct nfc_llcp_local *local; struct nlattr *attr, *sdp_attrs[NFC_SDP_ATTR_MAX+1]; u32 idx; u8 tid; char *uri; int rc = 0, rem; size_t uri_len, tlvs_len; struct hlist_head 
sdreq_list; struct nfc_llcp_sdp_tlv *sdreq; if (!info->attrs[NFC_ATTR_DEVICE_INDEX] || !info->attrs[NFC_ATTR_LLC_SDP]) return -EINVAL; idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); dev = nfc_get_device(idx); if (!dev) return -ENODEV; device_lock(&dev->dev); if (dev->dep_link_up == false) { rc = -ENOLINK; goto exit; } local = nfc_llcp_find_local(dev); if (!local) { rc = -ENODEV; goto exit; } INIT_HLIST_HEAD(&sdreq_list); tlvs_len = 0; nla_for_each_nested(attr, info->attrs[NFC_ATTR_LLC_SDP], rem) { rc = nla_parse_nested_deprecated(sdp_attrs, NFC_SDP_ATTR_MAX, attr, nfc_sdp_genl_policy, info->extack); if (rc != 0) { rc = -EINVAL; goto put_local; } if (!sdp_attrs[NFC_SDP_ATTR_URI]) continue; uri_len = nla_len(sdp_attrs[NFC_SDP_ATTR_URI]); if (uri_len == 0) continue; uri = nla_data(sdp_attrs[NFC_SDP_ATTR_URI]); if (*uri == 0) continue; tid = local->sdreq_next_tid++; sdreq = nfc_llcp_build_sdreq_tlv(tid, uri, uri_len); if (sdreq == NULL) { rc = -ENOMEM; goto put_local; } tlvs_len += sdreq->tlv_len; hlist_add_head(&sdreq->node, &sdreq_list); } if (hlist_empty(&sdreq_list)) { rc = -EINVAL; goto put_local; } rc = nfc_llcp_send_snl_sdreq(local, &sdreq_list, tlvs_len); put_local: nfc_llcp_local_put(local); exit: device_unlock(&dev->dev); nfc_put_device(dev); return rc; } static int nfc_genl_fw_download(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; int rc; u32 idx; char firmware_name[NFC_FIRMWARE_NAME_MAXSIZE + 1]; if (!info->attrs[NFC_ATTR_DEVICE_INDEX] || !info->attrs[NFC_ATTR_FIRMWARE_NAME]) return -EINVAL; idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); dev = nfc_get_device(idx); if (!dev) return -ENODEV; nla_strscpy(firmware_name, info->attrs[NFC_ATTR_FIRMWARE_NAME], sizeof(firmware_name)); rc = nfc_fw_download(dev, firmware_name); nfc_put_device(dev); return rc; } int nfc_genl_fw_download_done(struct nfc_dev *dev, const char *firmware_name, u32 result) { struct sk_buff *msg; void *hdr; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); if (!msg) return -ENOMEM; hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, NFC_CMD_FW_DOWNLOAD); if (!hdr) goto free_msg; if (nla_put_string(msg, NFC_ATTR_FIRMWARE_NAME, firmware_name) || nla_put_u32(msg, NFC_ATTR_FIRMWARE_DOWNLOAD_STATUS, result) || nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx)) goto nla_put_failure; genlmsg_end(msg, hdr); genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_ATOMIC); return 0; nla_put_failure: free_msg: nlmsg_free(msg); return -EMSGSIZE; } static int nfc_genl_enable_se(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; int rc; u32 idx, se_idx; if (!info->attrs[NFC_ATTR_DEVICE_INDEX] || !info->attrs[NFC_ATTR_SE_INDEX]) return -EINVAL; idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); se_idx = nla_get_u32(info->attrs[NFC_ATTR_SE_INDEX]); dev = nfc_get_device(idx); if (!dev) return -ENODEV; rc = nfc_enable_se(dev, se_idx); nfc_put_device(dev); return rc; } static int nfc_genl_disable_se(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; int rc; u32 idx, se_idx; if (!info->attrs[NFC_ATTR_DEVICE_INDEX] || !info->attrs[NFC_ATTR_SE_INDEX]) return -EINVAL; idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); se_idx = nla_get_u32(info->attrs[NFC_ATTR_SE_INDEX]); dev = nfc_get_device(idx); if (!dev) return -ENODEV; rc = nfc_disable_se(dev, se_idx); nfc_put_device(dev); return rc; } static int nfc_genl_send_se(struct sk_buff *msg, struct nfc_dev *dev, u32 portid, u32 seq, struct netlink_callback *cb, int flags) { void *hdr; struct nfc_se *se, *n; 
list_for_each_entry_safe(se, n, &dev->secure_elements, list) { hdr = genlmsg_put(msg, portid, seq, &nfc_genl_family, flags, NFC_CMD_GET_SE); if (!hdr) goto nla_put_failure; if (cb) genl_dump_check_consistent(cb, hdr); if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx) || nla_put_u32(msg, NFC_ATTR_SE_INDEX, se->idx) || nla_put_u8(msg, NFC_ATTR_SE_TYPE, se->type)) goto nla_put_failure; genlmsg_end(msg, hdr); } return 0; nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } static int nfc_genl_dump_ses(struct sk_buff *skb, struct netlink_callback *cb) { struct class_dev_iter *iter = (struct class_dev_iter *) cb->args[0]; struct nfc_dev *dev = (struct nfc_dev *) cb->args[1]; bool first_call = false; if (!iter) { first_call = true; iter = kmalloc(sizeof(struct class_dev_iter), GFP_KERNEL); if (!iter) return -ENOMEM; cb->args[0] = (long) iter; } mutex_lock(&nfc_devlist_mutex); cb->seq = nfc_devlist_generation; if (first_call) { nfc_device_iter_init(iter); dev = nfc_device_iter_next(iter); } while (dev) { int rc; rc = nfc_genl_send_se(skb, dev, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, cb, NLM_F_MULTI); if (rc < 0) break; dev = nfc_device_iter_next(iter); } mutex_unlock(&nfc_devlist_mutex); cb->args[1] = (long) dev; return skb->len; } static int nfc_genl_dump_ses_done(struct netlink_callback *cb) { struct class_dev_iter *iter = (struct class_dev_iter *) cb->args[0]; if (iter) { nfc_device_iter_exit(iter); kfree(iter); } return 0; } static int nfc_se_io(struct nfc_dev *dev, u32 se_idx, u8 *apdu, size_t apdu_length, se_io_cb_t cb, void *cb_context) { struct nfc_se *se; int rc; pr_debug("%s se index %d\n", dev_name(&dev->dev), se_idx); device_lock(&dev->dev); if (!device_is_registered(&dev->dev)) { rc = -ENODEV; goto error; } if (!dev->dev_up) { rc = -ENODEV; goto error; } if (!dev->ops->se_io) { rc = -EOPNOTSUPP; goto error; } se = nfc_find_se(dev, se_idx); if (!se) { rc = -EINVAL; goto error; } if (se->state != NFC_SE_ENABLED) { rc = -ENODEV; goto error; } rc = dev->ops->se_io(dev, se_idx, apdu, apdu_length, cb, cb_context); device_unlock(&dev->dev); return rc; error: device_unlock(&dev->dev); kfree(cb_context); return rc; } struct se_io_ctx { u32 dev_idx; u32 se_idx; }; static void se_io_cb(void *context, u8 *apdu, size_t apdu_len, int err) { struct se_io_ctx *ctx = context; struct sk_buff *msg; void *hdr; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) { kfree(ctx); return; } hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0, NFC_CMD_SE_IO); if (!hdr) goto free_msg; if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, ctx->dev_idx) || nla_put_u32(msg, NFC_ATTR_SE_INDEX, ctx->se_idx) || nla_put(msg, NFC_ATTR_SE_APDU, apdu_len, apdu)) goto nla_put_failure; genlmsg_end(msg, hdr); genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_KERNEL); kfree(ctx); return; nla_put_failure: free_msg: nlmsg_free(msg); kfree(ctx); return; } static int nfc_genl_se_io(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; struct se_io_ctx *ctx; u32 dev_idx, se_idx; u8 *apdu; size_t apdu_len; int rc; if (!info->attrs[NFC_ATTR_DEVICE_INDEX] || !info->attrs[NFC_ATTR_SE_INDEX] || !info->attrs[NFC_ATTR_SE_APDU]) return -EINVAL; dev_idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); se_idx = nla_get_u32(info->attrs[NFC_ATTR_SE_INDEX]); dev = nfc_get_device(dev_idx); if (!dev) return -ENODEV; if (!dev->ops || !dev->ops->se_io) { rc = -EOPNOTSUPP; goto put_dev; } apdu_len = nla_len(info->attrs[NFC_ATTR_SE_APDU]); if (apdu_len == 0) { rc = -EINVAL; goto put_dev; } apdu = 
nla_data(info->attrs[NFC_ATTR_SE_APDU]); ctx = kzalloc(sizeof(struct se_io_ctx), GFP_KERNEL); if (!ctx) { rc = -ENOMEM; goto put_dev; } ctx->dev_idx = dev_idx; ctx->se_idx = se_idx; rc = nfc_se_io(dev, se_idx, apdu, apdu_len, se_io_cb, ctx); put_dev: nfc_put_device(dev); return rc; } static int nfc_genl_vendor_cmd(struct sk_buff *skb, struct genl_info *info) { struct nfc_dev *dev; const struct nfc_vendor_cmd *cmd; u32 dev_idx, vid, subcmd; u8 *data; size_t data_len; int i, err; if (!info->attrs[NFC_ATTR_DEVICE_INDEX] || !info->attrs[NFC_ATTR_VENDOR_ID] || !info->attrs[NFC_ATTR_VENDOR_SUBCMD]) return -EINVAL; dev_idx = nla_get_u32(info->attrs[NFC_ATTR_DEVICE_INDEX]); vid = nla_get_u32(info->attrs[NFC_ATTR_VENDOR_ID]); subcmd = nla_get_u32(info->attrs[NFC_ATTR_VENDOR_SUBCMD]); dev = nfc_get_device(dev_idx); if (!dev) return -ENODEV; if (!dev->vendor_cmds || !dev->n_vendor_cmds) { err = -ENODEV; goto put_dev; } if (info->attrs[NFC_ATTR_VENDOR_DATA]) { data = nla_data(info->attrs[NFC_ATTR_VENDOR_DATA]); data_len = nla_len(info->attrs[NFC_ATTR_VENDOR_DATA]); if (data_len == 0) { err = -EINVAL; goto put_dev; } } else { data = NULL; data_len = 0; } for (i = 0; i < dev->n_vendor_cmds; i++) { cmd = &dev->vendor_cmds[i]; if (cmd->vendor_id != vid || cmd->subcmd != subcmd) continue; dev->cur_cmd_info = info; err = cmd->doit(dev, data, data_len); dev->cur_cmd_info = NULL; goto put_dev; } err = -EOPNOTSUPP; put_dev: nfc_put_device(dev); return err; } /* message building helper */ static inline void *nfc_hdr_put(struct sk_buff *skb, u32 portid, u32 seq, int flags, u8 cmd) { /* since there is no private header just add the generic one */ return genlmsg_put(skb, portid, seq, &nfc_genl_family, flags, cmd); } static struct sk_buff * __nfc_alloc_vendor_cmd_skb(struct nfc_dev *dev, int approxlen, u32 portid, u32 seq, enum nfc_attrs attr, u32 oui, u32 subcmd, gfp_t gfp) { struct sk_buff *skb; void *hdr; skb = nlmsg_new(approxlen + 100, gfp); if (!skb) return NULL; hdr = nfc_hdr_put(skb, portid, seq, 0, NFC_CMD_VENDOR); if (!hdr) { kfree_skb(skb); return NULL; } if (nla_put_u32(skb, NFC_ATTR_DEVICE_INDEX, dev->idx)) goto nla_put_failure; if (nla_put_u32(skb, NFC_ATTR_VENDOR_ID, oui)) goto nla_put_failure; if (nla_put_u32(skb, NFC_ATTR_VENDOR_SUBCMD, subcmd)) goto nla_put_failure; ((void **)skb->cb)[0] = dev; ((void **)skb->cb)[1] = hdr; return skb; nla_put_failure: kfree_skb(skb); return NULL; } struct sk_buff *__nfc_alloc_vendor_cmd_reply_skb(struct nfc_dev *dev, enum nfc_attrs attr, u32 oui, u32 subcmd, int approxlen) { if (WARN_ON(!dev->cur_cmd_info)) return NULL; return __nfc_alloc_vendor_cmd_skb(dev, approxlen, dev->cur_cmd_info->snd_portid, dev->cur_cmd_info->snd_seq, attr, oui, subcmd, GFP_KERNEL); } EXPORT_SYMBOL(__nfc_alloc_vendor_cmd_reply_skb); int nfc_vendor_cmd_reply(struct sk_buff *skb) { struct nfc_dev *dev = ((void **)skb->cb)[0]; void *hdr = ((void **)skb->cb)[1]; /* clear CB data for netlink core to own from now on */ memset(skb->cb, 0, sizeof(skb->cb)); if (WARN_ON(!dev->cur_cmd_info)) { kfree_skb(skb); return -EINVAL; } genlmsg_end(skb, hdr); return genlmsg_reply(skb, dev->cur_cmd_info); } EXPORT_SYMBOL(nfc_vendor_cmd_reply); static const struct genl_ops nfc_genl_ops[] = { { .cmd = NFC_CMD_GET_DEVICE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_get_device, .dumpit = nfc_genl_dump_devices, .done = nfc_genl_dump_devices_done, }, { .cmd = NFC_CMD_DEV_UP, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_dev_up, .flags = 
GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_DEV_DOWN, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_dev_down, .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_START_POLL, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_start_poll, .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_STOP_POLL, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_stop_poll, .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_DEP_LINK_UP, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_dep_link_up, .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_DEP_LINK_DOWN, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_dep_link_down, .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_GET_TARGET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP_STRICT, .dumpit = nfc_genl_dump_targets, .done = nfc_genl_dump_targets_done, }, { .cmd = NFC_CMD_LLC_GET_PARAMS, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_llc_get_params, }, { .cmd = NFC_CMD_LLC_SET_PARAMS, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_llc_set_params, .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_LLC_SDREQ, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_llc_sdreq, .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_FW_DOWNLOAD, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_fw_download, .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_ENABLE_SE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_enable_se, .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_DISABLE_SE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_disable_se, .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_GET_SE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .dumpit = nfc_genl_dump_ses, .done = nfc_genl_dump_ses_done, }, { .cmd = NFC_CMD_SE_IO, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_se_io, .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_ACTIVATE_TARGET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_activate_target, .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_VENDOR, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_vendor_cmd, .flags = GENL_ADMIN_PERM, }, { .cmd = NFC_CMD_DEACTIVATE_TARGET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nfc_genl_deactivate_target, .flags = GENL_ADMIN_PERM, }, }; static struct genl_family nfc_genl_family __ro_after_init = { .hdrsize = 0, .name = NFC_GENL_NAME, .version = NFC_GENL_VERSION, .maxattr = NFC_ATTR_MAX, .policy = nfc_genl_policy, .module = THIS_MODULE, .ops = nfc_genl_ops, .n_ops = ARRAY_SIZE(nfc_genl_ops), .resv_start_op = NFC_CMD_DEACTIVATE_TARGET + 1, .mcgrps = nfc_genl_mcgrps, .n_mcgrps = ARRAY_SIZE(nfc_genl_mcgrps), }; struct urelease_work { struct work_struct w; u32 portid; }; static void nfc_urelease_event_work(struct work_struct *work) { struct urelease_work *w = container_of(work, struct urelease_work, w); struct class_dev_iter iter; struct nfc_dev *dev; pr_debug("portid %d\n", w->portid); mutex_lock(&nfc_devlist_mutex); nfc_device_iter_init(&iter); dev = nfc_device_iter_next(&iter); while (dev) { mutex_lock(&dev->genl_data.genl_data_mutex); if (dev->genl_data.poll_req_portid == w->portid) { nfc_stop_poll(dev); dev->genl_data.poll_req_portid = 0; } 
mutex_unlock(&dev->genl_data.genl_data_mutex); dev = nfc_device_iter_next(&iter); } nfc_device_iter_exit(&iter); mutex_unlock(&nfc_devlist_mutex); kfree(w); } static int nfc_genl_rcv_nl_event(struct notifier_block *this, unsigned long event, void *ptr) { struct netlink_notify *n = ptr; struct urelease_work *w; if (event != NETLINK_URELEASE || n->protocol != NETLINK_GENERIC) goto out; pr_debug("NETLINK_URELEASE event from id %d\n", n->portid); w = kmalloc(sizeof(*w), GFP_ATOMIC); if (w) { INIT_WORK(&w->w, nfc_urelease_event_work); w->portid = n->portid; schedule_work(&w->w); } out: return NOTIFY_DONE; } void nfc_genl_data_init(struct nfc_genl_data *genl_data) { genl_data->poll_req_portid = 0; mutex_init(&genl_data->genl_data_mutex); } void nfc_genl_data_exit(struct nfc_genl_data *genl_data) { mutex_destroy(&genl_data->genl_data_mutex); } static struct notifier_block nl_notifier = { .notifier_call = nfc_genl_rcv_nl_event, }; /** * nfc_genl_init() - Initialize netlink interface * * This initialization function registers the nfc netlink family. */ int __init nfc_genl_init(void) { int rc; rc = genl_register_family(&nfc_genl_family); if (rc) return rc; netlink_register_notifier(&nl_notifier); return 0; } /** * nfc_genl_exit() - Deinitialize netlink interface * * This exit function unregisters the nfc netlink family. */ void nfc_genl_exit(void) { netlink_unregister_notifier(&nl_notifier); genl_unregister_family(&nfc_genl_family); }
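The genetlink family registered above (NFC_GENL_NAME, i.e. "nfc") is driven from userspace over generic netlink. What follows is a minimal sketch, not part of the kernel sources in this report, assuming libnl-3/libnl-genl-3 is available: it issues NFC_CMD_DEV_UP (handled by nfc_genl_dev_up() above) for device index 0, which requires CAP_NET_ADMIN because the op is marked GENL_ADMIN_PERM.

/* Hypothetical userspace client; build roughly with
 * cc nfc_up.c $(pkg-config --cflags --libs libnl-genl-3.0) */
#include <stdio.h>
#include <linux/nfc.h>
#include <netlink/netlink.h>
#include <netlink/genl/genl.h>
#include <netlink/genl/ctrl.h>

int main(void)
{
	struct nl_sock *sk = nl_socket_alloc();
	struct nl_msg *msg;
	int family, err;

	if (!sk || genl_connect(sk))
		return 1;

	/* Resolve NFC_GENL_NAME ("nfc") to the dynamically assigned id. */
	family = genl_ctrl_resolve(sk, NFC_GENL_NAME);
	if (family < 0)
		return 1;

	msg = nlmsg_alloc();
	genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, family, 0, 0,
		    NFC_CMD_DEV_UP, NFC_GENL_VERSION);
	nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, 0);	/* nfc0 */

	err = nl_send_auto(sk, msg);
	if (err >= 0)
		err = nl_wait_for_ack(sk);	/* ACK or negative errno */

	printf("NFC_CMD_DEV_UP: %s\n", err < 0 ? nl_geterror(err) : "ok");
	nlmsg_free(msg);
	nl_socket_free(sk);
	return err < 0;
}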
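The proc_do*() handlers defined in the sysctl.c listing that follows are wired up by individual subsystems through struct ctl_table registrations. Below is a minimal sketch, not taken from that listing, assuming the register_sysctl() interface and a sentinel-free table (consistent with the const struct ctl_table handler signatures used below); the "example" directory, example_level variable, and its bounds are hypothetical.

/* Hypothetical module exposing /proc/sys/example/example_level,
 * range-checked by proc_dointvec_minmax() from the listing below. */
#include <linux/sysctl.h>
#include <linux/module.h>

static int example_level = 1;
static int example_min;		/* 0 */
static int example_max = 10;

static struct ctl_table example_table[] = {
	{
		.procname	= "example_level",
		.data		= &example_level,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &example_min,		/* min */
		.extra2		= &example_max,		/* max */
	},
};

static struct ctl_table_header *example_header;

static int __init example_sysctl_init(void)
{
	example_header = register_sysctl("example", example_table);
	return example_header ? 0 : -ENOMEM;
}

static void __exit example_sysctl_exit(void)
{
	unregister_sysctl_table(example_header);
}

module_init(example_sysctl_init);
module_exit(example_sysctl_exit);
MODULE_LICENSE("GPL");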
// SPDX-License-Identifier: GPL-2.0-only
/*
 * sysctl.c: General linux system control interface
 */

#include <linux/sysctl.h>
#include <linux/bitmap.h>
#include <linux/proc_fs.h>
#include <linux/ctype.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/kobject.h>
#include <linux/highuid.h>
#include <linux/writeback.h>
#include <linux/initrd.h>
#include
<linux/times.h> #include <linux/limits.h> #include <linux/syscalls.h> #include <linux/capability.h> #include "../lib/kstrtox.h" #include <linux/uaccess.h> #include <asm/processor.h> /* shared constants to be used in various sysctls */ const int sysctl_vals[] = { 0, 1, 2, 3, 4, 100, 200, 1000, 3000, INT_MAX, 65535, -1 }; EXPORT_SYMBOL(sysctl_vals); const unsigned long sysctl_long_vals[] = { 0, 1, LONG_MAX }; EXPORT_SYMBOL_GPL(sysctl_long_vals); #if defined(CONFIG_SYSCTL) /* Constants used for minimum and maximum */ static const int ngroups_max = NGROUPS_MAX; static const int cap_last_cap = CAP_LAST_CAP; #ifdef CONFIG_PROC_SYSCTL /** * enum sysctl_writes_mode - supported sysctl write modes * * @SYSCTL_WRITES_LEGACY: each write syscall must fully contain the sysctl value * to be written, and multiple writes on the same sysctl file descriptor * will rewrite the sysctl value, regardless of file position. No warning * is issued when the initial position is not 0. * @SYSCTL_WRITES_WARN: same as above but warn when the initial file position is * not 0. * @SYSCTL_WRITES_STRICT: writes to numeric sysctl entries must always be at * file position 0 and the value must be fully contained in the buffer * sent to the write syscall. If dealing with strings respect the file * position, but restrict this to the max length of the buffer, anything * passed the max length will be ignored. Multiple writes will append * to the buffer. * * These write modes control how current file position affects the behavior of * updating sysctl values through the proc interface on each write. */ enum sysctl_writes_mode { SYSCTL_WRITES_LEGACY = -1, SYSCTL_WRITES_WARN = 0, SYSCTL_WRITES_STRICT = 1, }; static enum sysctl_writes_mode sysctl_writes_strict = SYSCTL_WRITES_STRICT; #endif /* CONFIG_PROC_SYSCTL */ #endif /* CONFIG_SYSCTL */ /* * /proc/sys support */ #ifdef CONFIG_PROC_SYSCTL static int _proc_do_string(char *data, int maxlen, int write, char *buffer, size_t *lenp, loff_t *ppos) { size_t len; char c, *p; if (!data || !maxlen || !*lenp) { *lenp = 0; return 0; } if (write) { if (sysctl_writes_strict == SYSCTL_WRITES_STRICT) { /* Only continue writes not past the end of buffer. */ len = strlen(data); if (len > maxlen - 1) len = maxlen - 1; if (*ppos > len) return 0; len = *ppos; } else { /* Start writing from beginning of buffer. */ len = 0; } *ppos += *lenp; p = buffer; while ((p - buffer) < *lenp && len < maxlen - 1) { c = *(p++); if (c == 0 || c == '\n') break; data[len++] = c; } data[len] = 0; } else { len = strlen(data); if (len > maxlen) len = maxlen; if (*ppos > len) { *lenp = 0; return 0; } data += *ppos; len -= *ppos; if (len > *lenp) len = *lenp; if (len) memcpy(buffer, data, len); if (len < *lenp) { buffer[len] = '\n'; len++; } *lenp = len; *ppos += len; } return 0; } static void warn_sysctl_write(const struct ctl_table *table) { pr_warn_once("%s wrote to %s when file position was not 0!\n" "This will not be supported in the future. To silence this\n" "warning, set kernel.sysctl_writes_strict = -1\n", current->comm, table->procname); } /** * proc_first_pos_non_zero_ignore - check if first position is allowed * @ppos: file position * @table: the sysctl table * * Returns true if the first position is non-zero and the sysctl_writes_strict * mode indicates this is not allowed for numeric input types. String proc * handlers can ignore the return value. 
*/ static bool proc_first_pos_non_zero_ignore(loff_t *ppos, const struct ctl_table *table) { if (!*ppos) return false; switch (sysctl_writes_strict) { case SYSCTL_WRITES_STRICT: return true; case SYSCTL_WRITES_WARN: warn_sysctl_write(table); return false; default: return false; } } /** * proc_dostring - read a string sysctl * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position * * Reads/writes a string from/to the user buffer. If the kernel * buffer provided is not large enough to hold the string, the * string is truncated. The copied string is %NULL-terminated. * If the string is being read by the user process, it is copied * and a newline '\n' is added. It is truncated if the buffer is * not large enough. * * Returns 0 on success. */ int proc_dostring(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { if (write) proc_first_pos_non_zero_ignore(ppos, table); return _proc_do_string(table->data, table->maxlen, write, buffer, lenp, ppos); } static void proc_skip_spaces(char **buf, size_t *size) { while (*size) { if (!isspace(**buf)) break; (*size)--; (*buf)++; } } static void proc_skip_char(char **buf, size_t *size, const char v) { while (*size) { if (**buf != v) break; (*size)--; (*buf)++; } } /** * strtoul_lenient - parse an ASCII formatted integer from a buffer and only * fail on overflow * * @cp: kernel buffer containing the string to parse * @endp: pointer to store the trailing characters * @base: the base to use * @res: where the parsed integer will be stored * * In case of success 0 is returned and @res will contain the parsed integer, * @endp will hold any trailing characters. * This function will fail the parse on overflow. If there wasn't an overflow * the function will defer the decision what characters count as invalid to the * caller. */ static int strtoul_lenient(const char *cp, char **endp, unsigned int base, unsigned long *res) { unsigned long long result; unsigned int rv; cp = _parse_integer_fixup_radix(cp, &base); rv = _parse_integer(cp, base, &result); if ((rv & KSTRTOX_OVERFLOW) || (result != (unsigned long)result)) return -ERANGE; cp += rv; if (endp) *endp = (char *)cp; *res = (unsigned long)result; return 0; } #define TMPBUFLEN 22 /** * proc_get_long - reads an ASCII formatted integer from a user buffer * * @buf: a kernel buffer * @size: size of the kernel buffer * @val: this is where the number will be stored * @neg: set to %TRUE if number is negative * @perm_tr: a vector which contains the allowed trailers * @perm_tr_len: size of the perm_tr vector * @tr: pointer to store the trailer character * * In case of success %0 is returned and @buf and @size are updated with * the amount of bytes read. If @tr is non-NULL and a trailing * character exists (size is non-zero after returning from this * function), @tr is updated with the trailing character. */ static int proc_get_long(char **buf, size_t *size, unsigned long *val, bool *neg, const char *perm_tr, unsigned perm_tr_len, char *tr) { char *p, tmp[TMPBUFLEN]; ssize_t len = *size; if (len <= 0) return -EINVAL; if (len > TMPBUFLEN - 1) len = TMPBUFLEN - 1; memcpy(tmp, *buf, len); tmp[len] = 0; p = tmp; if (*p == '-' && *size > 1) { *neg = true; p++; } else *neg = false; if (!isdigit(*p)) return -EINVAL; if (strtoul_lenient(p, &p, 0, val)) return -EINVAL; len = p - tmp; /* We don't know if the next char is whitespace thus we may accept * invalid integers (e.g. 
1234...a) or two integers instead of one * (e.g. 123...1). So lets not allow such large numbers. */ if (len == TMPBUFLEN - 1) return -EINVAL; if (len < *size && perm_tr_len && !memchr(perm_tr, *p, perm_tr_len)) return -EINVAL; if (tr && (len < *size)) *tr = *p; *buf += len; *size -= len; return 0; } /** * proc_put_long - converts an integer to a decimal ASCII formatted string * * @buf: the user buffer * @size: the size of the user buffer * @val: the integer to be converted * @neg: sign of the number, %TRUE for negative * * In case of success @buf and @size are updated with the amount of bytes * written. */ static void proc_put_long(void **buf, size_t *size, unsigned long val, bool neg) { int len; char tmp[TMPBUFLEN], *p = tmp; sprintf(p, "%s%lu", neg ? "-" : "", val); len = strlen(tmp); if (len > *size) len = *size; memcpy(*buf, tmp, len); *size -= len; *buf += len; } #undef TMPBUFLEN static void proc_put_char(void **buf, size_t *size, char c) { if (*size) { char **buffer = (char **)buf; **buffer = c; (*size)--; (*buffer)++; *buf = *buffer; } } static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp, int *valp, int write, void *data) { if (write) { if (*negp) { if (*lvalp > (unsigned long) INT_MAX + 1) return -EINVAL; WRITE_ONCE(*valp, -*lvalp); } else { if (*lvalp > (unsigned long) INT_MAX) return -EINVAL; WRITE_ONCE(*valp, *lvalp); } } else { int val = READ_ONCE(*valp); if (val < 0) { *negp = true; *lvalp = -(unsigned long)val; } else { *negp = false; *lvalp = (unsigned long)val; } } return 0; } static int do_proc_douintvec_conv(unsigned long *lvalp, unsigned int *valp, int write, void *data) { if (write) { if (*lvalp > UINT_MAX) return -EINVAL; WRITE_ONCE(*valp, *lvalp); } else { unsigned int val = READ_ONCE(*valp); *lvalp = (unsigned long)val; } return 0; } static const char proc_wspace_sep[] = { ' ', '\t', '\n' }; static int __do_proc_dointvec(void *tbl_data, const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos, int (*conv)(bool *negp, unsigned long *lvalp, int *valp, int write, void *data), void *data) { int *i, vleft, first = 1, err = 0; size_t left; char *p; if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) { *lenp = 0; return 0; } i = (int *) tbl_data; vleft = table->maxlen / sizeof(*i); left = *lenp; if (!conv) conv = do_proc_dointvec_conv; if (write) { if (proc_first_pos_non_zero_ignore(ppos, table)) goto out; if (left > PAGE_SIZE - 1) left = PAGE_SIZE - 1; p = buffer; } for (; left && vleft--; i++, first=0) { unsigned long lval; bool neg; if (write) { proc_skip_spaces(&p, &left); if (!left) break; err = proc_get_long(&p, &left, &lval, &neg, proc_wspace_sep, sizeof(proc_wspace_sep), NULL); if (err) break; if (conv(&neg, &lval, i, 1, data)) { err = -EINVAL; break; } } else { if (conv(&neg, &lval, i, 0, data)) { err = -EINVAL; break; } if (!first) proc_put_char(&buffer, &left, '\t'); proc_put_long(&buffer, &left, lval, neg); } } if (!write && !first && left && !err) proc_put_char(&buffer, &left, '\n'); if (write && !err && left) proc_skip_spaces(&p, &left); if (write && first) return err ? 
: -EINVAL; *lenp -= left; out: *ppos += *lenp; return err; } static int do_proc_dointvec(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos, int (*conv)(bool *negp, unsigned long *lvalp, int *valp, int write, void *data), void *data) { return __do_proc_dointvec(table->data, table, write, buffer, lenp, ppos, conv, data); } static int do_proc_douintvec_w(unsigned int *tbl_data, const struct ctl_table *table, void *buffer, size_t *lenp, loff_t *ppos, int (*conv)(unsigned long *lvalp, unsigned int *valp, int write, void *data), void *data) { unsigned long lval; int err = 0; size_t left; bool neg; char *p = buffer; left = *lenp; if (proc_first_pos_non_zero_ignore(ppos, table)) goto bail_early; if (left > PAGE_SIZE - 1) left = PAGE_SIZE - 1; proc_skip_spaces(&p, &left); if (!left) { err = -EINVAL; goto out_free; } err = proc_get_long(&p, &left, &lval, &neg, proc_wspace_sep, sizeof(proc_wspace_sep), NULL); if (err || neg) { err = -EINVAL; goto out_free; } if (conv(&lval, tbl_data, 1, data)) { err = -EINVAL; goto out_free; } if (!err && left) proc_skip_spaces(&p, &left); out_free: if (err) return -EINVAL; return 0; /* This is in keeping with old __do_proc_dointvec() */ bail_early: *ppos += *lenp; return err; } static int do_proc_douintvec_r(unsigned int *tbl_data, void *buffer, size_t *lenp, loff_t *ppos, int (*conv)(unsigned long *lvalp, unsigned int *valp, int write, void *data), void *data) { unsigned long lval; int err = 0; size_t left; left = *lenp; if (conv(&lval, tbl_data, 0, data)) { err = -EINVAL; goto out; } proc_put_long(&buffer, &left, lval, false); if (!left) goto out; proc_put_char(&buffer, &left, '\n'); out: *lenp -= left; *ppos += *lenp; return err; } static int __do_proc_douintvec(void *tbl_data, const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos, int (*conv)(unsigned long *lvalp, unsigned int *valp, int write, void *data), void *data) { unsigned int *i, vleft; if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) { *lenp = 0; return 0; } i = (unsigned int *) tbl_data; vleft = table->maxlen / sizeof(*i); /* * Arrays are not supported, keep this simple. *Do not* add * support for them. */ if (vleft != 1) { *lenp = 0; return -EINVAL; } if (!conv) conv = do_proc_douintvec_conv; if (write) return do_proc_douintvec_w(i, table, buffer, lenp, ppos, conv, data); return do_proc_douintvec_r(i, buffer, lenp, ppos, conv, data); } int do_proc_douintvec(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos, int (*conv)(unsigned long *lvalp, unsigned int *valp, int write, void *data), void *data) { return __do_proc_douintvec(table->data, table, write, buffer, lenp, ppos, conv, data); } /** * proc_dobool - read/write a bool * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position * * Reads/writes one integer value from/to the user buffer, * treated as an ASCII string. * * table->data must point to a bool variable and table->maxlen must * be sizeof(bool). * * Returns 0 on success. */ int proc_dobool(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table tmp; bool *data = table->data; int res, val; /* Do not support arrays yet. 
*/ if (table->maxlen != sizeof(bool)) return -EINVAL; tmp = *table; tmp.maxlen = sizeof(val); tmp.data = &val; val = READ_ONCE(*data); res = proc_dointvec(&tmp, write, buffer, lenp, ppos); if (res) return res; if (write) WRITE_ONCE(*data, val); return 0; } /** * proc_dointvec - read a vector of integers * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position * * Reads/writes up to table->maxlen/sizeof(unsigned int) integer * values from/to the user buffer, treated as an ASCII string. * * Returns 0 on success. */ int proc_dointvec(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { return do_proc_dointvec(table, write, buffer, lenp, ppos, NULL, NULL); } /** * proc_douintvec - read a vector of unsigned integers * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position * * Reads/writes up to table->maxlen/sizeof(unsigned int) unsigned integer * values from/to the user buffer, treated as an ASCII string. * * Returns 0 on success. */ int proc_douintvec(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { return do_proc_douintvec(table, write, buffer, lenp, ppos, do_proc_douintvec_conv, NULL); } /** * struct do_proc_dointvec_minmax_conv_param - proc_dointvec_minmax() range checking structure * @min: pointer to minimum allowable value * @max: pointer to maximum allowable value * * The do_proc_dointvec_minmax_conv_param structure provides the * minimum and maximum values for doing range checking for those sysctl * parameters that use the proc_dointvec_minmax() handler. */ struct do_proc_dointvec_minmax_conv_param { int *min; int *max; }; static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp, int *valp, int write, void *data) { int tmp, ret; struct do_proc_dointvec_minmax_conv_param *param = data; /* * If writing, first do so via a temporary local int so we can * bounds-check it before touching *valp. */ int *ip = write ? &tmp : valp; ret = do_proc_dointvec_conv(negp, lvalp, ip, write, data); if (ret) return ret; if (write) { if ((param->min && *param->min > tmp) || (param->max && *param->max < tmp)) return -EINVAL; WRITE_ONCE(*valp, tmp); } return 0; } /** * proc_dointvec_minmax - read a vector of integers with min/max values * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position * * Reads/writes up to table->maxlen/sizeof(unsigned int) integer * values from/to the user buffer, treated as an ASCII string. * * This routine will ensure the values are within the range specified by * table->extra1 (min) and table->extra2 (max). * * Returns 0 on success or -EINVAL on write when the range check fails. 
*/ int proc_dointvec_minmax(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct do_proc_dointvec_minmax_conv_param param = { .min = (int *) table->extra1, .max = (int *) table->extra2, }; return do_proc_dointvec(table, write, buffer, lenp, ppos, do_proc_dointvec_minmax_conv, &param); } /** * struct do_proc_douintvec_minmax_conv_param - proc_douintvec_minmax() range checking structure * @min: pointer to minimum allowable value * @max: pointer to maximum allowable value * * The do_proc_douintvec_minmax_conv_param structure provides the * minimum and maximum values for doing range checking for those sysctl * parameters that use the proc_douintvec_minmax() handler. */ struct do_proc_douintvec_minmax_conv_param { unsigned int *min; unsigned int *max; }; static int do_proc_douintvec_minmax_conv(unsigned long *lvalp, unsigned int *valp, int write, void *data) { int ret; unsigned int tmp; struct do_proc_douintvec_minmax_conv_param *param = data; /* write via temporary local uint for bounds-checking */ unsigned int *up = write ? &tmp : valp; ret = do_proc_douintvec_conv(lvalp, up, write, data); if (ret) return ret; if (write) { if ((param->min && *param->min > tmp) || (param->max && *param->max < tmp)) return -ERANGE; WRITE_ONCE(*valp, tmp); } return 0; } /** * proc_douintvec_minmax - read a vector of unsigned ints with min/max values * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position * * Reads/writes up to table->maxlen/sizeof(unsigned int) unsigned integer * values from/to the user buffer, treated as an ASCII string. Negative * strings are not allowed. * * This routine will ensure the values are within the range specified by * table->extra1 (min) and table->extra2 (max). There is a final sanity * check for UINT_MAX to avoid having to support wrap around uses from * userspace. * * Returns 0 on success or -ERANGE on write when the range check fails. */ int proc_douintvec_minmax(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct do_proc_douintvec_minmax_conv_param param = { .min = (unsigned int *) table->extra1, .max = (unsigned int *) table->extra2, }; return do_proc_douintvec(table, write, buffer, lenp, ppos, do_proc_douintvec_minmax_conv, &param); } /** * proc_dou8vec_minmax - read a vector of unsigned chars with min/max values * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position * * Reads/writes up to table->maxlen/sizeof(u8) unsigned chars * values from/to the user buffer, treated as an ASCII string. Negative * strings are not allowed. * * This routine will ensure the values are within the range specified by * table->extra1 (min) and table->extra2 (max). * * Returns 0 on success or an error on write when the range check fails. */ int proc_dou8vec_minmax(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table tmp; unsigned int min = 0, max = 255U, val; u8 *data = table->data; struct do_proc_douintvec_minmax_conv_param param = { .min = &min, .max = &max, }; int res; /* Do not support arrays yet. 
*/ if (table->maxlen != sizeof(u8)) return -EINVAL; if (table->extra1) min = *(unsigned int *) table->extra1; if (table->extra2) max = *(unsigned int *) table->extra2; tmp = *table; tmp.maxlen = sizeof(val); tmp.data = &val; val = READ_ONCE(*data); res = do_proc_douintvec(&tmp, write, buffer, lenp, ppos, do_proc_douintvec_minmax_conv, &param); if (res) return res; if (write) WRITE_ONCE(*data, val); return 0; } EXPORT_SYMBOL_GPL(proc_dou8vec_minmax); static int __do_proc_doulongvec_minmax(void *data, const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos, unsigned long convmul, unsigned long convdiv) { unsigned long *i, *min, *max; int vleft, first = 1, err = 0; size_t left; char *p; if (!data || !table->maxlen || !*lenp || (*ppos && !write)) { *lenp = 0; return 0; } i = data; min = table->extra1; max = table->extra2; vleft = table->maxlen / sizeof(unsigned long); left = *lenp; if (write) { if (proc_first_pos_non_zero_ignore(ppos, table)) goto out; if (left > PAGE_SIZE - 1) left = PAGE_SIZE - 1; p = buffer; } for (; left && vleft--; i++, first = 0) { unsigned long val; if (write) { bool neg; proc_skip_spaces(&p, &left); if (!left) break; err = proc_get_long(&p, &left, &val, &neg, proc_wspace_sep, sizeof(proc_wspace_sep), NULL); if (err || neg) { err = -EINVAL; break; } val = convmul * val / convdiv; if ((min && val < *min) || (max && val > *max)) { err = -EINVAL; break; } WRITE_ONCE(*i, val); } else { val = convdiv * READ_ONCE(*i) / convmul; if (!first) proc_put_char(&buffer, &left, '\t'); proc_put_long(&buffer, &left, val, false); } } if (!write && !first && left && !err) proc_put_char(&buffer, &left, '\n'); if (write && !err) proc_skip_spaces(&p, &left); if (write && first) return err ? : -EINVAL; *lenp -= left; out: *ppos += *lenp; return err; } static int do_proc_doulongvec_minmax(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos, unsigned long convmul, unsigned long convdiv) { return __do_proc_doulongvec_minmax(table->data, table, write, buffer, lenp, ppos, convmul, convdiv); } /** * proc_doulongvec_minmax - read a vector of long integers with min/max values * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position * * Reads/writes up to table->maxlen/sizeof(unsigned long) unsigned long * values from/to the user buffer, treated as an ASCII string. * * This routine will ensure the values are within the range specified by * table->extra1 (min) and table->extra2 (max). * * Returns 0 on success. */ int proc_doulongvec_minmax(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { return do_proc_doulongvec_minmax(table, write, buffer, lenp, ppos, 1l, 1l); } /** * proc_doulongvec_ms_jiffies_minmax - read a vector of millisecond values with min/max values * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position * * Reads/writes up to table->maxlen/sizeof(unsigned long) unsigned long * values from/to the user buffer, treated as an ASCII string. The values * are treated as milliseconds, and converted to jiffies when they are stored. * * This routine will ensure the values are within the range specified by * table->extra1 (min) and table->extra2 (max). * * Returns 0 on success. 
*/ int proc_doulongvec_ms_jiffies_minmax(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { return do_proc_doulongvec_minmax(table, write, buffer, lenp, ppos, HZ, 1000l); } static int do_proc_dointvec_jiffies_conv(bool *negp, unsigned long *lvalp, int *valp, int write, void *data) { if (write) { if (*lvalp > INT_MAX / HZ) return 1; if (*negp) WRITE_ONCE(*valp, -*lvalp * HZ); else WRITE_ONCE(*valp, *lvalp * HZ); } else { int val = READ_ONCE(*valp); unsigned long lval; if (val < 0) { *negp = true; lval = -(unsigned long)val; } else { *negp = false; lval = (unsigned long)val; } *lvalp = lval / HZ; } return 0; } static int do_proc_dointvec_userhz_jiffies_conv(bool *negp, unsigned long *lvalp, int *valp, int write, void *data) { if (write) { if (USER_HZ < HZ && *lvalp > (LONG_MAX / HZ) * USER_HZ) return 1; *valp = clock_t_to_jiffies(*negp ? -*lvalp : *lvalp); } else { int val = *valp; unsigned long lval; if (val < 0) { *negp = true; lval = -(unsigned long)val; } else { *negp = false; lval = (unsigned long)val; } *lvalp = jiffies_to_clock_t(lval); } return 0; } static int do_proc_dointvec_ms_jiffies_conv(bool *negp, unsigned long *lvalp, int *valp, int write, void *data) { if (write) { unsigned long jif = msecs_to_jiffies(*negp ? -*lvalp : *lvalp); if (jif > INT_MAX) return 1; WRITE_ONCE(*valp, (int)jif); } else { int val = READ_ONCE(*valp); unsigned long lval; if (val < 0) { *negp = true; lval = -(unsigned long)val; } else { *negp = false; lval = (unsigned long)val; } *lvalp = jiffies_to_msecs(lval); } return 0; } static int do_proc_dointvec_ms_jiffies_minmax_conv(bool *negp, unsigned long *lvalp, int *valp, int write, void *data) { int tmp, ret; struct do_proc_dointvec_minmax_conv_param *param = data; /* * If writing, first do so via a temporary local int so we can * bounds-check it before touching *valp. */ int *ip = write ? &tmp : valp; ret = do_proc_dointvec_ms_jiffies_conv(negp, lvalp, ip, write, data); if (ret) return ret; if (write) { if ((param->min && *param->min > tmp) || (param->max && *param->max < tmp)) return -EINVAL; *valp = tmp; } return 0; } /** * proc_dointvec_jiffies - read a vector of integers as seconds * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position * * Reads/writes up to table->maxlen/sizeof(unsigned int) integer * values from/to the user buffer, treated as an ASCII string. * The values read are assumed to be in seconds, and are converted into * jiffies. * * Returns 0 on success. */ int proc_dointvec_jiffies(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { return do_proc_dointvec(table,write,buffer,lenp,ppos, do_proc_dointvec_jiffies_conv,NULL); } int proc_dointvec_ms_jiffies_minmax(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct do_proc_dointvec_minmax_conv_param param = { .min = (int *) table->extra1, .max = (int *) table->extra2, }; return do_proc_dointvec(table, write, buffer, lenp, ppos, do_proc_dointvec_ms_jiffies_minmax_conv, &param); } /** * proc_dointvec_userhz_jiffies - read a vector of integers as 1/USER_HZ seconds * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: pointer to the file position * * Reads/writes up to table->maxlen/sizeof(unsigned int) integer * values from/to the user buffer, treated as an ASCII string. 
* The values read are assumed to be in 1/USER_HZ seconds, and * are converted into jiffies. * * Returns 0 on success. */ int proc_dointvec_userhz_jiffies(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { return do_proc_dointvec(table, write, buffer, lenp, ppos, do_proc_dointvec_userhz_jiffies_conv, NULL); } /** * proc_dointvec_ms_jiffies - read a vector of integers as 1 milliseconds * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: the current position in the file * * Reads/writes up to table->maxlen/sizeof(unsigned int) integer * values from/to the user buffer, treated as an ASCII string. * The values read are assumed to be in 1/1000 seconds, and * are converted into jiffies. * * Returns 0 on success. */ int proc_dointvec_ms_jiffies(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { return do_proc_dointvec(table, write, buffer, lenp, ppos, do_proc_dointvec_ms_jiffies_conv, NULL); } /** * proc_do_large_bitmap - read/write from/to a large bitmap * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position * * The bitmap is stored at table->data and the bitmap length (in bits) * in table->maxlen. * * We use a range comma separated format (e.g. 1,3-4,10-10) so that * large bitmaps may be represented in a compact manner. Writing into * the file will clear the bitmap then update it with the given input. * * Returns 0 on success. */ int proc_do_large_bitmap(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int err = 0; size_t left = *lenp; unsigned long bitmap_len = table->maxlen; unsigned long *bitmap = *(unsigned long **) table->data; unsigned long *tmp_bitmap = NULL; char tr_a[] = { '-', ',', '\n' }, tr_b[] = { ',', '\n', 0 }, c; if (!bitmap || !bitmap_len || !left || (*ppos && !write)) { *lenp = 0; return 0; } if (write) { char *p = buffer; size_t skipped = 0; if (left > PAGE_SIZE - 1) { left = PAGE_SIZE - 1; /* How much of the buffer we'll skip this pass */ skipped = *lenp - left; } tmp_bitmap = bitmap_zalloc(bitmap_len, GFP_KERNEL); if (!tmp_bitmap) return -ENOMEM; proc_skip_char(&p, &left, '\n'); while (!err && left) { unsigned long val_a, val_b; bool neg; size_t saved_left; /* In case we stop parsing mid-number, we can reset */ saved_left = left; err = proc_get_long(&p, &left, &val_a, &neg, tr_a, sizeof(tr_a), &c); /* * If we consumed the entirety of a truncated buffer or * only one char is left (may be a "-"), then stop here, * reset, & come back for more. */ if ((left <= 1) && skipped) { left = saved_left; break; } if (err) break; if (val_a >= bitmap_len || neg) { err = -EINVAL; break; } val_b = val_a; if (left) { p++; left--; } if (c == '-') { err = proc_get_long(&p, &left, &val_b, &neg, tr_b, sizeof(tr_b), &c); /* * If we consumed all of a truncated buffer or * then stop here, reset, & come back for more. 
*/ if (!left && skipped) { left = saved_left; break; } if (err) break; if (val_b >= bitmap_len || neg || val_a > val_b) { err = -EINVAL; break; } if (left) { p++; left--; } } bitmap_set(tmp_bitmap, val_a, val_b - val_a + 1); proc_skip_char(&p, &left, '\n'); } left += skipped; } else { unsigned long bit_a, bit_b = 0; bool first = 1; while (left) { bit_a = find_next_bit(bitmap, bitmap_len, bit_b); if (bit_a >= bitmap_len) break; bit_b = find_next_zero_bit(bitmap, bitmap_len, bit_a + 1) - 1; if (!first) proc_put_char(&buffer, &left, ','); proc_put_long(&buffer, &left, bit_a, false); if (bit_a != bit_b) { proc_put_char(&buffer, &left, '-'); proc_put_long(&buffer, &left, bit_b, false); } first = 0; bit_b++; } proc_put_char(&buffer, &left, '\n'); } if (!err) { if (write) { if (*ppos) bitmap_or(bitmap, bitmap, tmp_bitmap, bitmap_len); else bitmap_copy(bitmap, tmp_bitmap, bitmap_len); } *lenp -= left; *ppos += *lenp; } bitmap_free(tmp_bitmap); return err; } #else /* CONFIG_PROC_SYSCTL */ int proc_dostring(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_dobool(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_dointvec(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_douintvec(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_dointvec_minmax(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_douintvec_minmax(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_dou8vec_minmax(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_dointvec_jiffies(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_dointvec_ms_jiffies_minmax(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_dointvec_userhz_jiffies(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_dointvec_ms_jiffies(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_doulongvec_minmax(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_doulongvec_ms_jiffies_minmax(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_do_large_bitmap(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } #endif /* CONFIG_PROC_SYSCTL */ #if defined(CONFIG_SYSCTL) int proc_do_static_key(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct static_key *key = (struct static_key *)table->data; static DEFINE_MUTEX(static_key_mutex); int val, ret; struct ctl_table tmp = { .data = &val, .maxlen = sizeof(val), .mode = table->mode, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }; if (write && !capable(CAP_SYS_ADMIN)) return -EPERM; mutex_lock(&static_key_mutex); val = static_key_enabled(key); ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); if (write && !ret) { if (val) static_key_enable(key); else static_key_disable(key); } mutex_unlock(&static_key_mutex); return ret; } static const struct 
ctl_table sysctl_subsys_table[] = { #ifdef CONFIG_PROC_SYSCTL { .procname = "sysctl_writes_strict", .data = &sysctl_writes_strict, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_NEG_ONE, .extra2 = SYSCTL_ONE, }, #endif { .procname = "ngroups_max", .data = (void *)&ngroups_max, .maxlen = sizeof (int), .mode = 0444, .proc_handler = proc_dointvec, }, { .procname = "cap_last_cap", .data = (void *)&cap_last_cap, .maxlen = sizeof(int), .mode = 0444, .proc_handler = proc_dointvec, }, #ifdef CONFIG_SYSCTL_ARCH_UNALIGN_ALLOW { .procname = "unaligned-trap", .data = &unaligned_enabled, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec, }, #endif #ifdef CONFIG_SYSCTL_ARCH_UNALIGN_NO_WARN { .procname = "ignore-unaligned-usertrap", .data = &no_unaligned_warning, .maxlen = sizeof (int), .mode = 0644, .proc_handler = proc_dointvec, }, #endif }; int __init sysctl_init_bases(void) { register_sysctl_init("kernel", sysctl_subsys_table); return 0; } #endif /* CONFIG_SYSCTL */ /* * No sense putting this after each symbol definition, twice, * exception granted :-) */ EXPORT_SYMBOL(proc_dobool); EXPORT_SYMBOL(proc_dointvec); EXPORT_SYMBOL(proc_douintvec); EXPORT_SYMBOL(proc_dointvec_jiffies); EXPORT_SYMBOL(proc_dointvec_minmax); EXPORT_SYMBOL_GPL(proc_douintvec_minmax); EXPORT_SYMBOL(proc_dointvec_userhz_jiffies); EXPORT_SYMBOL(proc_dointvec_ms_jiffies); EXPORT_SYMBOL(proc_dostring); EXPORT_SYMBOL(proc_doulongvec_minmax); EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax); EXPORT_SYMBOL(proc_do_large_bitmap);
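/*
 * Illustrative sketch (not part of the original file), kept under an unused
 * guard so it never builds: how a subsystem typically wires one of the
 * handlers above into its own table. The names EXAMPLE_SYSCTL_SKETCH,
 * example_value, example_table and example_sysctl_init are hypothetical;
 * the pattern mirrors sysctl_subsys_table above, using proc_dointvec_minmax
 * with SYSCTL_ZERO/SYSCTL_ONE so writes outside [0, 1] fail with -EINVAL.
 */
#ifdef EXAMPLE_SYSCTL_SKETCH
static int example_value;

static const struct ctl_table example_table[] = {
	{
		.procname	= "example_value",
		.data		= &example_value,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= SYSCTL_ZERO,	/* minimum accepted value */
		.extra2		= SYSCTL_ONE,	/* maximum accepted value */
	},
};

static int __init example_sysctl_init(void)
{
	/* Registers /proc/sys/kernel/example_value with the handler above. */
	register_sysctl_init("kernel", example_table);
	return 0;
}
#endif /* EXAMPLE_SYSCTL_SKETCH */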
// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB /* * Copyright (c) 2017, Mellanox Technologies inc. All rights reserved.
*/ #include <rdma/uverbs_ioctl.h> #include <rdma/rdma_user_ioctl.h> #include <linux/bitops.h> #include "rdma_core.h" #include "uverbs.h" static int ib_uverbs_notsupp(struct uverbs_attr_bundle *attrs) { return -EOPNOTSUPP; } static void *uapi_add_elm(struct uverbs_api *uapi, u32 key, size_t alloc_size) { void *elm; int rc; if (key == UVERBS_API_KEY_ERR) return ERR_PTR(-EOVERFLOW); elm = kzalloc(alloc_size, GFP_KERNEL); if (!elm) return ERR_PTR(-ENOMEM); rc = radix_tree_insert(&uapi->radix, key, elm); if (rc) { kfree(elm); return ERR_PTR(rc); } return elm; } static void *uapi_add_get_elm(struct uverbs_api *uapi, u32 key, size_t alloc_size, bool *exists) { void *elm; elm = uapi_add_elm(uapi, key, alloc_size); if (!IS_ERR(elm)) { *exists = false; return elm; } if (elm != ERR_PTR(-EEXIST)) return elm; elm = radix_tree_lookup(&uapi->radix, key); if (WARN_ON(!elm)) return ERR_PTR(-EINVAL); *exists = true; return elm; } static int uapi_create_write(struct uverbs_api *uapi, struct ib_device *ibdev, const struct uapi_definition *def, u32 obj_key, u32 *cur_method_key) { struct uverbs_api_write_method *method_elm; u32 method_key = obj_key; bool exists; if (def->write.is_ex) method_key |= uapi_key_write_ex_method(def->write.command_num); else method_key |= uapi_key_write_method(def->write.command_num); method_elm = uapi_add_get_elm(uapi, method_key, sizeof(*method_elm), &exists); if (IS_ERR(method_elm)) return PTR_ERR(method_elm); if (WARN_ON(exists && (def->write.is_ex != method_elm->is_ex))) return -EINVAL; method_elm->is_ex = def->write.is_ex; method_elm->handler = def->func_write; if (!def->write.is_ex) method_elm->disabled = !(ibdev->uverbs_cmd_mask & BIT_ULL(def->write.command_num)); if (!def->write.is_ex && def->func_write) { method_elm->has_udata = def->write.has_udata; method_elm->has_resp = def->write.has_resp; method_elm->req_size = def->write.req_size; method_elm->resp_size = def->write.resp_size; } *cur_method_key = method_key; return 0; } static int uapi_merge_method(struct uverbs_api *uapi, struct uverbs_api_object *obj_elm, u32 obj_key, const struct uverbs_method_def *method, bool is_driver) { u32 method_key = obj_key | uapi_key_ioctl_method(method->id); struct uverbs_api_ioctl_method *method_elm; unsigned int i; bool exists; if (!method->attrs) return 0; method_elm = uapi_add_get_elm(uapi, method_key, sizeof(*method_elm), &exists); if (IS_ERR(method_elm)) return PTR_ERR(method_elm); if (exists) { /* * This occurs when a driver uses ADD_UVERBS_ATTRIBUTES_SIMPLE */ if (WARN_ON(method->handler)) return -EINVAL; } else { WARN_ON(!method->handler); rcu_assign_pointer(method_elm->handler, method->handler); if (method->handler != uverbs_destroy_def_handler) method_elm->driver_method = is_driver; } for (i = 0; i != method->num_attrs; i++) { const struct uverbs_attr_def *attr = (*method->attrs)[i]; struct uverbs_api_attr *attr_slot; if (!attr) continue; /* * ENUM_IN contains the 'ids' pointer to the driver's .rodata, * so if it is specified by a driver then it always makes this * into a driver method. 
*/ if (attr->attr.type == UVERBS_ATTR_TYPE_ENUM_IN) method_elm->driver_method |= is_driver; /* * Like other uobject based things we only support a single * uobject being NEW'd or DESTROY'd */ if (attr->attr.type == UVERBS_ATTR_TYPE_IDRS_ARRAY) { u8 access = attr->attr.u2.objs_arr.access; if (WARN_ON(access == UVERBS_ACCESS_NEW || access == UVERBS_ACCESS_DESTROY)) return -EINVAL; } attr_slot = uapi_add_elm(uapi, method_key | uapi_key_attr(attr->id), sizeof(*attr_slot)); /* Attributes are not allowed to be modified by drivers */ if (IS_ERR(attr_slot)) return PTR_ERR(attr_slot); attr_slot->spec = attr->attr; } return 0; } static int uapi_merge_obj_tree(struct uverbs_api *uapi, const struct uverbs_object_def *obj, bool is_driver) { struct uverbs_api_object *obj_elm; unsigned int i; u32 obj_key; bool exists; int rc; obj_key = uapi_key_obj(obj->id); obj_elm = uapi_add_get_elm(uapi, obj_key, sizeof(*obj_elm), &exists); if (IS_ERR(obj_elm)) return PTR_ERR(obj_elm); if (obj->type_attrs) { if (WARN_ON(obj_elm->type_attrs)) return -EINVAL; obj_elm->id = obj->id; obj_elm->type_attrs = obj->type_attrs; obj_elm->type_class = obj->type_attrs->type_class; /* * Today drivers are only permitted to use idr_class and * fd_class types. We can revoke the IDR types during * disassociation, and the FD types require the driver to use * struct file_operations.owner to prevent the driver module * code from unloading while the file is open. This provides * enough safety that uverbs_uobject_fd_release() will * continue to work. Drivers using FD are responsible to * handle disassociation of the device on their own. */ if (WARN_ON(is_driver && obj->type_attrs->type_class != &uverbs_idr_class && obj->type_attrs->type_class != &uverbs_fd_class)) return -EINVAL; } if (!obj->methods) return 0; for (i = 0; i != obj->num_methods; i++) { const struct uverbs_method_def *method = (*obj->methods)[i]; if (!method) continue; rc = uapi_merge_method(uapi, obj_elm, obj_key, method, is_driver); if (rc) return rc; } return 0; } static int uapi_disable_elm(struct uverbs_api *uapi, const struct uapi_definition *def, u32 obj_key, u32 method_key) { bool exists; if (def->scope == UAPI_SCOPE_OBJECT) { struct uverbs_api_object *obj_elm; obj_elm = uapi_add_get_elm( uapi, obj_key, sizeof(*obj_elm), &exists); if (IS_ERR(obj_elm)) return PTR_ERR(obj_elm); obj_elm->disabled = 1; return 0; } if (def->scope == UAPI_SCOPE_METHOD && uapi_key_is_ioctl_method(method_key)) { struct uverbs_api_ioctl_method *method_elm; method_elm = uapi_add_get_elm(uapi, method_key, sizeof(*method_elm), &exists); if (IS_ERR(method_elm)) return PTR_ERR(method_elm); method_elm->disabled = 1; return 0; } if (def->scope == UAPI_SCOPE_METHOD && (uapi_key_is_write_method(method_key) || uapi_key_is_write_ex_method(method_key))) { struct uverbs_api_write_method *write_elm; write_elm = uapi_add_get_elm(uapi, method_key, sizeof(*write_elm), &exists); if (IS_ERR(write_elm)) return PTR_ERR(write_elm); write_elm->disabled = 1; return 0; } WARN_ON(true); return -EINVAL; } static int uapi_merge_def(struct uverbs_api *uapi, struct ib_device *ibdev, const struct uapi_definition *def_list, bool is_driver) { const struct uapi_definition *def = def_list; u32 cur_obj_key = UVERBS_API_KEY_ERR; u32 cur_method_key = UVERBS_API_KEY_ERR; bool exists; int rc; if (!def_list) return 0; for (;; def++) { switch ((enum uapi_definition_kind)def->kind) { case UAPI_DEF_CHAIN: rc = uapi_merge_def(uapi, ibdev, def->chain, is_driver); if (rc) return rc; continue; case UAPI_DEF_CHAIN_OBJ_TREE: if 
(WARN_ON(def->object_start.object_id != def->chain_obj_tree->id)) return -EINVAL; cur_obj_key = uapi_key_obj(def->object_start.object_id); rc = uapi_merge_obj_tree(uapi, def->chain_obj_tree, is_driver); if (rc) return rc; continue; case UAPI_DEF_END: return 0; case UAPI_DEF_IS_SUPPORTED_DEV_FN: { void **ibdev_fn = (void *)(&ibdev->ops) + def->needs_fn_offset; if (*ibdev_fn) continue; rc = uapi_disable_elm( uapi, def, cur_obj_key, cur_method_key); if (rc) return rc; continue; } case UAPI_DEF_IS_SUPPORTED_FUNC: if (def->func_is_supported(ibdev)) continue; rc = uapi_disable_elm( uapi, def, cur_obj_key, cur_method_key); if (rc) return rc; continue; case UAPI_DEF_OBJECT_START: { struct uverbs_api_object *obj_elm; cur_obj_key = uapi_key_obj(def->object_start.object_id); obj_elm = uapi_add_get_elm(uapi, cur_obj_key, sizeof(*obj_elm), &exists); if (IS_ERR(obj_elm)) return PTR_ERR(obj_elm); continue; } case UAPI_DEF_WRITE: rc = uapi_create_write( uapi, ibdev, def, cur_obj_key, &cur_method_key); if (rc) return rc; continue; } WARN_ON(true); return -EINVAL; } } static int uapi_finalize_ioctl_method(struct uverbs_api *uapi, struct uverbs_api_ioctl_method *method_elm, u32 method_key) { struct radix_tree_iter iter; unsigned int num_attrs = 0; unsigned int max_bkey = 0; bool single_uobj = false; void __rcu **slot; method_elm->destroy_bkey = UVERBS_API_ATTR_BKEY_LEN; radix_tree_for_each_slot (slot, &uapi->radix, &iter, uapi_key_attrs_start(method_key)) { struct uverbs_api_attr *elm = rcu_dereference_protected(*slot, true); u32 attr_key = iter.index & UVERBS_API_ATTR_KEY_MASK; u32 attr_bkey = uapi_bkey_attr(attr_key); u8 type = elm->spec.type; if (uapi_key_attr_to_ioctl_method(iter.index) != uapi_key_attr_to_ioctl_method(method_key)) break; if (elm->spec.mandatory) __set_bit(attr_bkey, method_elm->attr_mandatory); if (elm->spec.is_udata) method_elm->has_udata = true; if (type == UVERBS_ATTR_TYPE_IDR || type == UVERBS_ATTR_TYPE_FD) { u8 access = elm->spec.u.obj.access; /* * Verbs specs may only have one NEW/DESTROY, we don't * have the infrastructure to abort multiple NEW's or * cope with multiple DESTROY failure. 
*/ if (access == UVERBS_ACCESS_NEW || access == UVERBS_ACCESS_DESTROY) { if (WARN_ON(single_uobj)) return -EINVAL; single_uobj = true; if (WARN_ON(!elm->spec.mandatory)) return -EINVAL; } if (access == UVERBS_ACCESS_DESTROY) method_elm->destroy_bkey = attr_bkey; } max_bkey = max(max_bkey, attr_bkey); num_attrs++; } method_elm->key_bitmap_len = max_bkey + 1; WARN_ON(method_elm->key_bitmap_len > UVERBS_API_ATTR_BKEY_LEN); uapi_compute_bundle_size(method_elm, num_attrs); return 0; } static int uapi_finalize(struct uverbs_api *uapi) { const struct uverbs_api_write_method **data; unsigned long max_write_ex = 0; unsigned long max_write = 0; struct radix_tree_iter iter; void __rcu **slot; int rc; int i; radix_tree_for_each_slot (slot, &uapi->radix, &iter, 0) { struct uverbs_api_ioctl_method *method_elm = rcu_dereference_protected(*slot, true); if (uapi_key_is_ioctl_method(iter.index)) { rc = uapi_finalize_ioctl_method(uapi, method_elm, iter.index); if (rc) return rc; } if (uapi_key_is_write_method(iter.index)) max_write = max(max_write, iter.index & UVERBS_API_ATTR_KEY_MASK); if (uapi_key_is_write_ex_method(iter.index)) max_write_ex = max(max_write_ex, iter.index & UVERBS_API_ATTR_KEY_MASK); } uapi->notsupp_method.handler = ib_uverbs_notsupp; uapi->num_write = max_write + 1; uapi->num_write_ex = max_write_ex + 1; data = kmalloc_array(uapi->num_write + uapi->num_write_ex, sizeof(*uapi->write_methods), GFP_KERNEL); if (!data) return -ENOMEM; for (i = 0; i != uapi->num_write + uapi->num_write_ex; i++) data[i] = &uapi->notsupp_method; uapi->write_methods = data; uapi->write_ex_methods = data + uapi->num_write; radix_tree_for_each_slot (slot, &uapi->radix, &iter, 0) { if (uapi_key_is_write_method(iter.index)) uapi->write_methods[iter.index & UVERBS_API_ATTR_KEY_MASK] = rcu_dereference_protected(*slot, true); if (uapi_key_is_write_ex_method(iter.index)) uapi->write_ex_methods[iter.index & UVERBS_API_ATTR_KEY_MASK] = rcu_dereference_protected(*slot, true); } return 0; } static void uapi_remove_range(struct uverbs_api *uapi, u32 start, u32 last) { struct radix_tree_iter iter; void __rcu **slot; radix_tree_for_each_slot (slot, &uapi->radix, &iter, start) { if (iter.index > last) return; kfree(rcu_dereference_protected(*slot, true)); radix_tree_iter_delete(&uapi->radix, &iter, slot); } } static void uapi_remove_object(struct uverbs_api *uapi, u32 obj_key) { uapi_remove_range(uapi, obj_key, obj_key | UVERBS_API_METHOD_KEY_MASK | UVERBS_API_ATTR_KEY_MASK); } static void uapi_remove_method(struct uverbs_api *uapi, u32 method_key) { uapi_remove_range(uapi, method_key, method_key | UVERBS_API_ATTR_KEY_MASK); } static u32 uapi_get_obj_id(struct uverbs_attr_spec *spec) { if (spec->type == UVERBS_ATTR_TYPE_IDR || spec->type == UVERBS_ATTR_TYPE_FD) return spec->u.obj.obj_type; if (spec->type == UVERBS_ATTR_TYPE_IDRS_ARRAY) return spec->u2.objs_arr.obj_type; return UVERBS_API_KEY_ERR; } static void uapi_key_okay(u32 key) { unsigned int count = 0; if (uapi_key_is_object(key)) count++; if (uapi_key_is_ioctl_method(key)) count++; if (uapi_key_is_write_method(key)) count++; if (uapi_key_is_write_ex_method(key)) count++; if (uapi_key_is_attr(key)) count++; WARN(count != 1, "Bad count %u key=%x", count, key); } static void uapi_finalize_disable(struct uverbs_api *uapi) { struct radix_tree_iter iter; u32 starting_key = 0; bool scan_again = false; void __rcu **slot; again: radix_tree_for_each_slot (slot, &uapi->radix, &iter, starting_key) { uapi_key_okay(iter.index); if (uapi_key_is_object(iter.index)) { struct 
uverbs_api_object *obj_elm = rcu_dereference_protected(*slot, true); if (obj_elm->disabled) { /* Have to check all the attrs again */ scan_again = true; starting_key = iter.index; uapi_remove_object(uapi, iter.index); goto again; } continue; } if (uapi_key_is_ioctl_method(iter.index)) { struct uverbs_api_ioctl_method *method_elm = rcu_dereference_protected(*slot, true); if (method_elm->disabled) { starting_key = iter.index; uapi_remove_method(uapi, iter.index); goto again; } continue; } if (uapi_key_is_write_method(iter.index) || uapi_key_is_write_ex_method(iter.index)) { struct uverbs_api_write_method *method_elm = rcu_dereference_protected(*slot, true); if (method_elm->disabled) { kfree(method_elm); radix_tree_iter_delete(&uapi->radix, &iter, slot); } continue; } if (uapi_key_is_attr(iter.index)) { struct uverbs_api_attr *attr_elm = rcu_dereference_protected(*slot, true); const struct uverbs_api_object *tmp_obj; u32 obj_key; /* * If the method has a mandatory object handle * attribute which relies on an object which is not * present then the entire method is uncallable. */ if (!attr_elm->spec.mandatory) continue; obj_key = uapi_get_obj_id(&attr_elm->spec); if (obj_key == UVERBS_API_KEY_ERR) continue; tmp_obj = uapi_get_object(uapi, obj_key); if (IS_ERR(tmp_obj)) { if (PTR_ERR(tmp_obj) == -ENOMSG) continue; } else { if (!tmp_obj->disabled) continue; } starting_key = iter.index; uapi_remove_method( uapi, iter.index & (UVERBS_API_OBJ_KEY_MASK | UVERBS_API_METHOD_KEY_MASK)); goto again; } WARN_ON(false); } if (!scan_again) return; scan_again = false; starting_key = 0; goto again; } void uverbs_destroy_api(struct uverbs_api *uapi) { if (!uapi) return; uapi_remove_range(uapi, 0, U32_MAX); kfree(uapi->write_methods); kfree(uapi); } static const struct uapi_definition uverbs_core_api[] = { UAPI_DEF_CHAIN(uverbs_def_obj_async_fd), UAPI_DEF_CHAIN(uverbs_def_obj_counters), UAPI_DEF_CHAIN(uverbs_def_obj_cq), UAPI_DEF_CHAIN(uverbs_def_obj_device), UAPI_DEF_CHAIN(uverbs_def_obj_dm), UAPI_DEF_CHAIN(uverbs_def_obj_dmah), UAPI_DEF_CHAIN(uverbs_def_obj_flow_action), UAPI_DEF_CHAIN(uverbs_def_obj_intf), UAPI_DEF_CHAIN(uverbs_def_obj_mr), UAPI_DEF_CHAIN(uverbs_def_obj_qp), UAPI_DEF_CHAIN(uverbs_def_obj_srq), UAPI_DEF_CHAIN(uverbs_def_obj_wq), UAPI_DEF_CHAIN(uverbs_def_write_intf), {}, }; struct uverbs_api *uverbs_alloc_api(struct ib_device *ibdev) { struct uverbs_api *uapi; int rc; uapi = kzalloc(sizeof(*uapi), GFP_KERNEL); if (!uapi) return ERR_PTR(-ENOMEM); INIT_RADIX_TREE(&uapi->radix, GFP_KERNEL); uapi->driver_id = ibdev->ops.driver_id; rc = uapi_merge_def(uapi, ibdev, uverbs_core_api, false); if (rc) goto err; rc = uapi_merge_def(uapi, ibdev, ibdev->driver_def, true); if (rc) goto err; uapi_finalize_disable(uapi); rc = uapi_finalize(uapi); if (rc) goto err; return uapi; err: if (rc != -ENOMEM) dev_err(&ibdev->dev, "Setup of uverbs_api failed, kernel parsing tree description is not valid (%d)??\n", rc); uverbs_destroy_api(uapi); return ERR_PTR(rc); } /* * The pre version is done before destroying the HW objects, it only blocks * off method access. All methods that require the ib_dev or the module data * must test one of these assignments prior to continuing. 
*/ void uverbs_disassociate_api_pre(struct ib_uverbs_device *uverbs_dev) { struct uverbs_api *uapi = uverbs_dev->uapi; struct radix_tree_iter iter; void __rcu **slot; rcu_assign_pointer(uverbs_dev->ib_dev, NULL); radix_tree_for_each_slot (slot, &uapi->radix, &iter, 0) { if (uapi_key_is_ioctl_method(iter.index)) { struct uverbs_api_ioctl_method *method_elm = rcu_dereference_protected(*slot, true); if (method_elm->driver_method) rcu_assign_pointer(method_elm->handler, NULL); } } synchronize_srcu(&uverbs_dev->disassociate_srcu); } /* * Called when a driver disassociates from the ib_uverbs_device. The * assumption is that the driver module will unload after. Replace everything * related to the driver with NULL as a safety measure. */ void uverbs_disassociate_api(struct uverbs_api *uapi) { struct radix_tree_iter iter; void __rcu **slot; radix_tree_for_each_slot (slot, &uapi->radix, &iter, 0) { if (uapi_key_is_object(iter.index)) { struct uverbs_api_object *object_elm = rcu_dereference_protected(*slot, true); /* * Some type_attrs are in the driver module. We don't * bother to keep track of which since there should be * no use of this after disassociate. */ object_elm->type_attrs = NULL; } else if (uapi_key_is_attr(iter.index)) { struct uverbs_api_attr *elm = rcu_dereference_protected(*slot, true); if (elm->spec.type == UVERBS_ATTR_TYPE_ENUM_IN) elm->spec.u2.enum_def.ids = NULL; } } }
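/*
 * Illustrative sketch (not part of the original file), guarded so it never
 * builds: how the radix tree assembled above is keyed. An ioctl method slot
 * lives at uapi_key_obj(object_id) | uapi_key_ioctl_method(method_id), and
 * each attribute slot at that key ORed with uapi_key_attr(attr_id), which is
 * how uapi_merge_method() inserts them. The helper name example_lookup_attr
 * and the UVERBS_UAPI_SKETCH guard are hypothetical.
 */
#ifdef UVERBS_UAPI_SKETCH
static struct uverbs_api_attr *
example_lookup_attr(struct uverbs_api *uapi, u16 object_id, u16 method_id,
		    u16 attr_id)
{
	u32 key = uapi_key_obj(object_id) |
		  uapi_key_ioctl_method(method_id) |
		  uapi_key_attr(attr_id);

	/* Any out-of-range component collapses the key to the error value. */
	if (key == UVERBS_API_KEY_ERR)
		return NULL;
	return radix_tree_lookup(&uapi->radix, key);
}
#endif /* UVERBS_UAPI_SKETCH */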
// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) ST-Ericsson AB 2010 * Author: Sjur Brendeland */ #define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__ #include <linux/stddef.h> #include <linux/spinlock.h> #include <linux/slab.h> #include <linux/rculist.h> #include <net/caif/cfpkt.h> #include <net/caif/cfmuxl.h> #include <net/caif/cfsrvl.h> #include <net/caif/cffrml.h> #define container_obj(layr) container_of(layr, struct cfmuxl, layer) #define CAIF_CTRL_CHANNEL 0 #define UP_CACHE_SIZE 8 #define DN_CACHE_SIZE 8 struct cfmuxl { struct cflayer layer; struct list_head srvl_list; struct list_head frml_list; struct cflayer *up_cache[UP_CACHE_SIZE]; struct cflayer *dn_cache[DN_CACHE_SIZE]; /* * Set when inserting or removing downwards layers. */ spinlock_t transmit_lock; /* * Set when inserting or removing upwards layers. 
*/ spinlock_t receive_lock; }; static int cfmuxl_receive(struct cflayer *layr, struct cfpkt *pkt); static int cfmuxl_transmit(struct cflayer *layr, struct cfpkt *pkt); static void cfmuxl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl, int phyid); static struct cflayer *get_up(struct cfmuxl *muxl, u16 id); struct cflayer *cfmuxl_create(void) { struct cfmuxl *this = kzalloc(sizeof(struct cfmuxl), GFP_ATOMIC); if (!this) return NULL; this->layer.receive = cfmuxl_receive; this->layer.transmit = cfmuxl_transmit; this->layer.ctrlcmd = cfmuxl_ctrlcmd; INIT_LIST_HEAD(&this->srvl_list); INIT_LIST_HEAD(&this->frml_list); spin_lock_init(&this->transmit_lock); spin_lock_init(&this->receive_lock); snprintf(this->layer.name, CAIF_LAYER_NAME_SZ, "mux"); return &this->layer; } int cfmuxl_set_dnlayer(struct cflayer *layr, struct cflayer *dn, u8 phyid) { struct cfmuxl *muxl = (struct cfmuxl *) layr; spin_lock_bh(&muxl->transmit_lock); list_add_rcu(&dn->node, &muxl->frml_list); spin_unlock_bh(&muxl->transmit_lock); return 0; } static struct cflayer *get_from_id(struct list_head *list, u16 id) { struct cflayer *lyr; list_for_each_entry_rcu(lyr, list, node) { if (lyr->id == id) return lyr; } return NULL; } int cfmuxl_set_uplayer(struct cflayer *layr, struct cflayer *up, u8 linkid) { struct cfmuxl *muxl = container_obj(layr); struct cflayer *old; spin_lock_bh(&muxl->receive_lock); /* Two entries with same id is wrong, so remove old layer from mux */ old = get_from_id(&muxl->srvl_list, linkid); if (old != NULL) list_del_rcu(&old->node); list_add_rcu(&up->node, &muxl->srvl_list); spin_unlock_bh(&muxl->receive_lock); return 0; } struct cflayer *cfmuxl_remove_dnlayer(struct cflayer *layr, u8 phyid) { struct cfmuxl *muxl = container_obj(layr); struct cflayer *dn; int idx = phyid % DN_CACHE_SIZE; spin_lock_bh(&muxl->transmit_lock); RCU_INIT_POINTER(muxl->dn_cache[idx], NULL); dn = get_from_id(&muxl->frml_list, phyid); if (dn == NULL) goto out; list_del_rcu(&dn->node); caif_assert(dn != NULL); out: spin_unlock_bh(&muxl->transmit_lock); return dn; } static struct cflayer *get_up(struct cfmuxl *muxl, u16 id) { struct cflayer *up; int idx = id % UP_CACHE_SIZE; up = rcu_dereference(muxl->up_cache[idx]); if (up == NULL || up->id != id) { spin_lock_bh(&muxl->receive_lock); up = get_from_id(&muxl->srvl_list, id); rcu_assign_pointer(muxl->up_cache[idx], up); spin_unlock_bh(&muxl->receive_lock); } return up; } static struct cflayer *get_dn(struct cfmuxl *muxl, struct dev_info *dev_info) { struct cflayer *dn; int idx = dev_info->id % DN_CACHE_SIZE; dn = rcu_dereference(muxl->dn_cache[idx]); if (dn == NULL || dn->id != dev_info->id) { spin_lock_bh(&muxl->transmit_lock); dn = get_from_id(&muxl->frml_list, dev_info->id); rcu_assign_pointer(muxl->dn_cache[idx], dn); spin_unlock_bh(&muxl->transmit_lock); } return dn; } struct cflayer *cfmuxl_remove_uplayer(struct cflayer *layr, u8 id) { struct cflayer *up; struct cfmuxl *muxl = container_obj(layr); int idx = id % UP_CACHE_SIZE; if (id == 0) { pr_warn("Trying to remove control layer\n"); return NULL; } spin_lock_bh(&muxl->receive_lock); up = get_from_id(&muxl->srvl_list, id); if (up == NULL) goto out; RCU_INIT_POINTER(muxl->up_cache[idx], NULL); list_del_rcu(&up->node); out: spin_unlock_bh(&muxl->receive_lock); return up; } static int cfmuxl_receive(struct cflayer *layr, struct cfpkt *pkt) { int ret; struct cfmuxl *muxl = container_obj(layr); u8 id; struct cflayer *up; if (cfpkt_extr_head(pkt, &id, 1) < 0) { pr_err("erroneous Caif Packet\n"); cfpkt_destroy(pkt); return -EPROTO; 
} rcu_read_lock(); up = get_up(muxl, id); if (up == NULL) { pr_debug("Received data on unknown link ID = %d (0x%x)" " up == NULL", id, id); cfpkt_destroy(pkt); /* * Don't return ERROR, since modem misbehaves and sends out * flow on before linksetup response. */ rcu_read_unlock(); return /* CFGLU_EPROT; */ 0; } /* We can't hold rcu_lock during receive, so take a ref count instead */ cfsrvl_get(up); rcu_read_unlock(); ret = up->receive(up, pkt); cfsrvl_put(up); return ret; } static int cfmuxl_transmit(struct cflayer *layr, struct cfpkt *pkt) { struct cfmuxl *muxl = container_obj(layr); int err; u8 linkid; struct cflayer *dn; struct caif_payload_info *info = cfpkt_info(pkt); BUG_ON(!info); rcu_read_lock(); dn = get_dn(muxl, info->dev_info); if (dn == NULL) { pr_debug("Send data on unknown phy ID = %d (0x%x)\n", info->dev_info->id, info->dev_info->id); rcu_read_unlock(); cfpkt_destroy(pkt); return -ENOTCONN; } info->hdr_len += 1; linkid = info->channel_id; cfpkt_add_head(pkt, &linkid, 1); /* We can't hold rcu_lock during receive, so take a ref count instead */ cffrml_hold(dn); rcu_read_unlock(); err = dn->transmit(dn, pkt); cffrml_put(dn); return err; } static void cfmuxl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl, int phyid) { struct cfmuxl *muxl = container_obj(layr); struct cflayer *layer; rcu_read_lock(); list_for_each_entry_rcu(layer, &muxl->srvl_list, node) { if (cfsrvl_phyid_match(layer, phyid) && layer->ctrlcmd) { if ((ctrl == _CAIF_CTRLCMD_PHYIF_DOWN_IND || ctrl == CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND) && layer->id != 0) cfmuxl_remove_uplayer(layr, layer->id); /* NOTE: ctrlcmd is not allowed to block */ layer->ctrlcmd(layer, ctrl, phyid); } } rcu_read_unlock(); }
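/*
 * Illustrative sketch (not part of the original file), guarded so it never
 * builds: the intended control-path usage of the mux above. A service layer
 * is registered against a link id with cfmuxl_set_uplayer() and torn down
 * with cfmuxl_remove_uplayer(); the data path (cfmuxl_receive()/
 * cfmuxl_transmit()) then resolves it under rcu_read_lock() through the
 * per-id caches. The CFMUXL_SKETCH guard and example_attach_service() name
 * are hypothetical, and setting srvl->id here is an assumption about how the
 * caller prepares the layer.
 */
#ifdef CFMUXL_SKETCH
static int example_attach_service(struct cflayer *mux, struct cflayer *srvl,
				  u8 linkid)
{
	int err;

	srvl->id = linkid;	/* id matched by get_up()/get_from_id() */
	err = cfmuxl_set_uplayer(mux, srvl, linkid);
	if (err)
		return err;

	/* ... later, on link teardown ... */
	if (cfmuxl_remove_uplayer(mux, linkid) == NULL)
		return -ENODEV;
	return 0;
}
#endif /* CFMUXL_SKETCH */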
/* SPDX-License-Identifier: GPL-2.0 */ /* Perform sanity checking for object sizes for uaccess.h and uio.h. */ #ifndef __LINUX_UCOPYSIZE_H__ #define __LINUX_UCOPYSIZE_H__ #include <linux/bug.h> #ifdef CONFIG_HARDENED_USERCOPY #include <linux/jump_label.h> extern void __check_object_size(const void *ptr, unsigned long n, bool to_user); DECLARE_STATIC_KEY_MAYBE(CONFIG_HARDENED_USERCOPY_DEFAULT_ON, validate_usercopy_range); static __always_inline void check_object_size(const void *ptr, unsigned long n, bool to_user) { if (!__builtin_constant_p(n) && static_branch_maybe(CONFIG_HARDENED_USERCOPY_DEFAULT_ON, &validate_usercopy_range)) { __check_object_size(ptr, n, to_user); } } #else static inline void check_object_size(const void *ptr, unsigned long n, bool to_user) { } #endif /* CONFIG_HARDENED_USERCOPY */ extern void __compiletime_error("copy source size is too small") __bad_copy_from(void); extern void __compiletime_error("copy destination size is too small") __bad_copy_to(void); void __copy_overflow(int size, unsigned long count); static inline void copy_overflow(int size, unsigned long count) { if (IS_ENABLED(CONFIG_BUG)) __copy_overflow(size, count); } static __always_inline __must_check bool check_copy_size(const void *addr, size_t bytes, bool is_source) { int sz = __builtin_object_size(addr, 0); if (unlikely(sz >= 0 && sz < bytes)) { if (!__builtin_constant_p(bytes)) copy_overflow(sz, bytes); else if (is_source) __bad_copy_from(); else __bad_copy_to(); return false; } if (WARN_ON_ONCE(bytes > INT_MAX)) return false; check_object_size(addr, bytes, is_source); return true; } #endif /* __LINUX_UCOPYSIZE_H__ */
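/*
 * Illustrative sketch (not part of the original header), guarded so it never
 * builds: the "validate, then copy" pattern check_copy_size() is written
 * for, as used by the uaccess/uio wrappers. A provably too-small source
 * object becomes a compile-time error via __bad_copy_from(), while a
 * runtime-only size goes through the hardened usercopy check. The wrapper
 * name example_copy_to_user and the UCOPYSIZE_SKETCH guard are hypothetical,
 * and raw_copy_to_user() is assumed to come from <linux/uaccess.h>.
 */
#ifdef UCOPYSIZE_SKETCH
static __always_inline unsigned long
example_copy_to_user(void __user *to, const void *from, unsigned long n)
{
	if (!check_copy_size(from, n, true))	/* is_source = true */
		return n;			/* nothing copied */
	return raw_copy_to_user(to, from, n);	/* returns bytes NOT copied */
}
#endif /* UCOPYSIZE_SKETCH */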
// SPDX-License-Identifier: GPL-2.0 /* * PCI searching functions * * Copyright (C) 1993 -- 1997 Drew Eckhardt, Frederic Potter, * David Mosberger-Tang * Copyright (C) 1997 -- 2000 Martin Mares <mj@ucw.cz> * Copyright (C) 2003 -- 2004 Greg Kroah-Hartman <greg@kroah.com> */ #include <linux/pci.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/interrupt.h> #include "pci.h" DECLARE_RWSEM(pci_bus_sem); /* * pci_for_each_dma_alias - Iterate over DMA aliases for a device * @pdev: starting downstream device * @fn: function to call for each alias * @data: opaque data to pass to @fn * * Starting @pdev, walk up the bus calling @fn for each possible alias * of @pdev at the root bus. */ int pci_for_each_dma_alias(struct pci_dev *pdev, int (*fn)(struct pci_dev *pdev, u16 alias, void *data), void *data) { struct pci_bus *bus; int ret; /* * The device may have an explicit alias requester ID for DMA where the * requester is on another PCI bus. */ pdev = pci_real_dma_dev(pdev); ret = fn(pdev, pci_dev_id(pdev), data); if (ret) return ret; /* * If the device is broken and uses an alias requester ID for * DMA, iterate over that too. */ if (unlikely(pdev->dma_alias_mask)) { unsigned int devfn; for_each_set_bit(devfn, pdev->dma_alias_mask, MAX_NR_DEVFNS) { ret = fn(pdev, PCI_DEVID(pdev->bus->number, devfn), data); if (ret) return ret; } } for (bus = pdev->bus; !pci_is_root_bus(bus); bus = bus->parent) { struct pci_dev *tmp; /* Skip virtual buses */ if (!bus->self) continue; tmp = bus->self; /* stop at bridge where translation unit is associated */ if (tmp->dev_flags & PCI_DEV_FLAGS_BRIDGE_XLATE_ROOT) return ret; /* * PCIe-to-PCI/X bridges alias transactions from downstream * devices using the subordinate bus number (PCI Express to * PCI/PCI-X Bridge Spec, rev 1.0, sec 2.3). 
For all cases * where the upstream bus is PCI/X we alias to the bridge * (there are various conditions in the previous reference * where the bridge may take ownership of transactions, even * when the secondary interface is PCI-X). */ if (pci_is_pcie(tmp)) { switch (pci_pcie_type(tmp)) { case PCI_EXP_TYPE_ROOT_PORT: case PCI_EXP_TYPE_UPSTREAM: case PCI_EXP_TYPE_DOWNSTREAM: continue; case PCI_EXP_TYPE_PCI_BRIDGE: ret = fn(tmp, PCI_DEVID(tmp->subordinate->number, PCI_DEVFN(0, 0)), data); if (ret) return ret; continue; case PCI_EXP_TYPE_PCIE_BRIDGE: ret = fn(tmp, pci_dev_id(tmp), data); if (ret) return ret; continue; } } else { if (tmp->dev_flags & PCI_DEV_FLAG_PCIE_BRIDGE_ALIAS) ret = fn(tmp, PCI_DEVID(tmp->subordinate->number, PCI_DEVFN(0, 0)), data); else ret = fn(tmp, pci_dev_id(tmp), data); if (ret) return ret; } } return ret; } static struct pci_bus *pci_do_find_bus(struct pci_bus *bus, unsigned char busnr) { struct pci_bus *child; struct pci_bus *tmp; if (bus->number == busnr) return bus; list_for_each_entry(tmp, &bus->children, node) { child = pci_do_find_bus(tmp, busnr); if (child) return child; } return NULL; } /** * pci_find_bus - locate PCI bus from a given domain and bus number * @domain: number of PCI domain to search * @busnr: number of desired PCI bus * * Given a PCI bus number and domain number, the desired PCI bus is located * in the global list of PCI buses. If the bus is found, a pointer to its * data structure is returned. If no bus is found, %NULL is returned. */ struct pci_bus *pci_find_bus(int domain, int busnr) { struct pci_bus *bus = NULL; struct pci_bus *tmp_bus; while ((bus = pci_find_next_bus(bus)) != NULL) { if (pci_domain_nr(bus) != domain) continue; tmp_bus = pci_do_find_bus(bus, busnr); if (tmp_bus) return tmp_bus; } return NULL; } EXPORT_SYMBOL(pci_find_bus); /** * pci_find_next_bus - begin or continue searching for a PCI bus * @from: Previous PCI bus found, or %NULL for new search. * * Iterates through the list of known PCI buses. A new search is * initiated by passing %NULL as the @from argument. Otherwise if * @from is not %NULL, searches continue from next device on the * global list. */ struct pci_bus *pci_find_next_bus(const struct pci_bus *from) { struct list_head *n; struct pci_bus *b = NULL; down_read(&pci_bus_sem); n = from ? from->node.next : pci_root_buses.next; if (n != &pci_root_buses) b = list_entry(n, struct pci_bus, node); up_read(&pci_bus_sem); return b; } EXPORT_SYMBOL(pci_find_next_bus); /** * pci_get_slot - locate PCI device for a given PCI slot * @bus: PCI bus on which desired PCI device resides * @devfn: encodes number of PCI slot in which the desired PCI * device resides and the logical device number within that slot * in case of multi-function devices. * * Given a PCI bus and slot/function number, the desired PCI device * is located in the list of PCI devices. * If the device is found, its reference count is increased and this * function returns a pointer to its data structure. The caller must * decrement the reference count by calling pci_dev_put(). * If no device is found, %NULL is returned. 
*/ struct pci_dev *pci_get_slot(struct pci_bus *bus, unsigned int devfn) { struct pci_dev *dev; down_read(&pci_bus_sem); list_for_each_entry(dev, &bus->devices, bus_list) { if (dev->devfn == devfn) goto out; } dev = NULL; out: pci_dev_get(dev); up_read(&pci_bus_sem); return dev; } EXPORT_SYMBOL(pci_get_slot); /** * pci_get_domain_bus_and_slot - locate PCI device for a given PCI domain (segment), bus, and slot * @domain: PCI domain/segment on which the PCI device resides. * @bus: PCI bus on which desired PCI device resides * @devfn: encodes number of PCI slot in which the desired PCI device * resides and the logical device number within that slot in case of * multi-function devices. * * Given a PCI domain, bus, and slot/function number, the desired PCI * device is located in the list of PCI devices. If the device is * found, its reference count is increased and this function returns a * pointer to its data structure. The caller must decrement the * reference count by calling pci_dev_put(). If no device is found, * %NULL is returned. */ struct pci_dev *pci_get_domain_bus_and_slot(int domain, unsigned int bus, unsigned int devfn) { struct pci_dev *dev = NULL; for_each_pci_dev(dev) { if (pci_domain_nr(dev->bus) == domain && (dev->bus->number == bus && dev->devfn == devfn)) return dev; } return NULL; } EXPORT_SYMBOL(pci_get_domain_bus_and_slot); static int match_pci_dev_by_id(struct device *dev, const void *data) { struct pci_dev *pdev = to_pci_dev(dev); const struct pci_device_id *id = data; if (pci_match_one_device(id, pdev)) return 1; return 0; } /* * pci_get_dev_by_id - begin or continue searching for a PCI device by id * @id: pointer to struct pci_device_id to match for the device * @from: Previous PCI device found in search, or %NULL for new search. * * Iterates through the list of known PCI devices. If a PCI device is found * with a matching id a pointer to its device structure is returned, and the * reference count to the device is incremented. Otherwise, %NULL is returned. * A new search is initiated by passing %NULL as the @from argument. Otherwise * if @from is not %NULL, searches continue from next device on the global * list. The reference count for @from is always decremented if it is not * %NULL. * * This is an internal function for use by the other search functions in * this file. */ static struct pci_dev *pci_get_dev_by_id(const struct pci_device_id *id, struct pci_dev *from) { struct device *dev; struct device *dev_start = NULL; struct pci_dev *pdev = NULL; if (from) dev_start = &from->dev; dev = bus_find_device(&pci_bus_type, dev_start, (void *)id, match_pci_dev_by_id); if (dev) pdev = to_pci_dev(dev); pci_dev_put(from); return pdev; } /** * pci_get_subsys - begin or continue searching for a PCI device by vendor/subvendor/device/subdevice id * @vendor: PCI vendor id to match, or %PCI_ANY_ID to match all vendor ids * @device: PCI device id to match, or %PCI_ANY_ID to match all device ids * @ss_vendor: PCI subsystem vendor id to match, or %PCI_ANY_ID to match all vendor ids * @ss_device: PCI subsystem device id to match, or %PCI_ANY_ID to match all device ids * @from: Previous PCI device found in search, or %NULL for new search. * * Iterates through the list of known PCI devices. If a PCI device is found * with a matching @vendor, @device, @ss_vendor and @ss_device, a pointer to its * device structure is returned, and the reference count to the device is * incremented. Otherwise, %NULL is returned. A new search is initiated by * passing %NULL as the @from argument. 
Otherwise if @from is not %NULL, * searches continue from next device on the global list. * The reference count for @from is always decremented if it is not %NULL. */ struct pci_dev *pci_get_subsys(unsigned int vendor, unsigned int device, unsigned int ss_vendor, unsigned int ss_device, struct pci_dev *from) { struct pci_device_id id = { .vendor = vendor, .device = device, .subvendor = ss_vendor, .subdevice = ss_device, }; return pci_get_dev_by_id(&id, from); } EXPORT_SYMBOL(pci_get_subsys); /** * pci_get_device - begin or continue searching for a PCI device by vendor/device id * @vendor: PCI vendor id to match, or %PCI_ANY_ID to match all vendor ids * @device: PCI device id to match, or %PCI_ANY_ID to match all device ids * @from: Previous PCI device found in search, or %NULL for new search. * * Iterates through the list of known PCI devices. If a PCI device is * found with a matching @vendor and @device, the reference count to the * device is incremented and a pointer to its device structure is returned. * Otherwise, %NULL is returned. A new search is initiated by passing %NULL * as the @from argument. Otherwise if @from is not %NULL, searches continue * from next device on the global list. The reference count for @from is * always decremented if it is not %NULL. */ struct pci_dev *pci_get_device(unsigned int vendor, unsigned int device, struct pci_dev *from) { return pci_get_subsys(vendor, device, PCI_ANY_ID, PCI_ANY_ID, from); } EXPORT_SYMBOL(pci_get_device); /** * pci_get_class - begin or continue searching for a PCI device by class * @class: search for a PCI device with this class designation * @from: Previous PCI device found in search, or %NULL for new search. * * Iterates through the list of known PCI devices. If a PCI device is * found with a matching @class, the reference count to the device is * incremented and a pointer to its device structure is returned. * Otherwise, %NULL is returned. * A new search is initiated by passing %NULL as the @from argument. * Otherwise if @from is not %NULL, searches continue from next device * on the global list. The reference count for @from is always decremented * if it is not %NULL. */ struct pci_dev *pci_get_class(unsigned int class, struct pci_dev *from) { struct pci_device_id id = { .vendor = PCI_ANY_ID, .device = PCI_ANY_ID, .subvendor = PCI_ANY_ID, .subdevice = PCI_ANY_ID, .class_mask = PCI_ANY_ID, .class = class, }; return pci_get_dev_by_id(&id, from); } EXPORT_SYMBOL(pci_get_class); /** * pci_get_base_class - searching for a PCI device by matching against the base class code only * @class: search for a PCI device with this base class code * @from: Previous PCI device found in search, or %NULL for new search. * * Iterates through the list of known PCI devices. If a PCI device is found * with a matching base class code, the reference count to the device is * incremented. See pci_match_one_device() to figure out how does this works. * A new search is initiated by passing %NULL as the @from argument. * Otherwise if @from is not %NULL, searches continue from next device on the * global list. The reference count for @from is always decremented if it is * not %NULL. * * Returns: * A pointer to a matched PCI device, %NULL Otherwise. 
*/ struct pci_dev *pci_get_base_class(unsigned int class, struct pci_dev *from) { struct pci_device_id id = { .vendor = PCI_ANY_ID, .device = PCI_ANY_ID, .subvendor = PCI_ANY_ID, .subdevice = PCI_ANY_ID, .class_mask = 0xFF0000, .class = class << 16, }; return pci_get_dev_by_id(&id, from); } EXPORT_SYMBOL(pci_get_base_class); /** * pci_dev_present - Returns 1 if device matching the device list is present, 0 if not. * @ids: A pointer to a null terminated list of struct pci_device_id structures * that describe the type of PCI device the caller is trying to find. * * Obvious fact: You do not have a reference to any device that might be found * by this function, so if that device is removed from the system right after * this function is finished, the value will be stale. Use this function to * find devices that are usually built into a system, or for a general hint as * to if another device happens to be present at this specific moment in time. */ int pci_dev_present(const struct pci_device_id *ids) { struct pci_dev *found = NULL; while (ids->vendor || ids->subvendor || ids->class_mask) { found = pci_get_dev_by_id(ids, NULL); if (found) { pci_dev_put(found); return 1; } ids++; } return 0; } EXPORT_SYMBOL(pci_dev_present);
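/*
 * Editor's note: a minimal usage sketch, not part of the original file,
 * showing the calling convention of the search API above. The vendor id
 * and the example_* names are illustrative only.
 */
static void example_scan_pci_devices(void)
{
	struct pci_dev *pdev = NULL;

	/*
	 * pci_get_device() returns each match with its reference count
	 * elevated and drops the reference on the device passed in as
	 * @from, so a loop that runs to completion needs no explicit
	 * pci_dev_put(); only an early break would.
	 */
	while ((pdev = pci_get_device(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, pdev)))
		pr_info("found %04x:%04x at %s\n",
			pdev->vendor, pdev->device, pci_name(pdev));
}

/*
 * pci_dev_present() only answers "is such a device present right now";
 * the id table must be terminated by an all-zero entry.
 */
static const struct pci_device_id example_present_ids[] = {
	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_ANY_ID) },
	{ }
};

static bool example_pci_device_present(void)
{
	return pci_dev_present(example_present_ids);
}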
// SPDX-License-Identifier: GPL-2.0-only /* * Berkeley Packet Filter based traffic classifier * * Might be used to classify traffic through flexible, user-defined and * possibly JIT-ed BPF filters for traffic control as an alternative to * ematches.
* * (C) 2013 Daniel Borkmann <dborkman@redhat.com> */ #include <linux/module.h> #include <linux/types.h> #include <linux/skbuff.h> #include <linux/filter.h> #include <linux/bpf.h> #include <linux/idr.h> #include <net/rtnetlink.h> #include <net/pkt_cls.h> #include <net/sock.h> #include <net/tc_wrapper.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Daniel Borkmann <dborkman@redhat.com>"); MODULE_DESCRIPTION("TC BPF based classifier"); #define CLS_BPF_NAME_LEN 256 #define CLS_BPF_SUPPORTED_GEN_FLAGS \ (TCA_CLS_FLAGS_SKIP_HW | TCA_CLS_FLAGS_SKIP_SW) struct cls_bpf_head { struct list_head plist; struct idr handle_idr; struct rcu_head rcu; }; struct cls_bpf_prog { struct bpf_prog *filter; struct list_head link; struct tcf_result res; bool exts_integrated; u32 gen_flags; unsigned int in_hw_count; struct tcf_exts exts; u32 handle; u16 bpf_num_ops; struct sock_filter *bpf_ops; const char *bpf_name; struct tcf_proto *tp; struct rcu_work rwork; }; static const struct nla_policy bpf_policy[TCA_BPF_MAX + 1] = { [TCA_BPF_CLASSID] = { .type = NLA_U32 }, [TCA_BPF_FLAGS] = { .type = NLA_U32 }, [TCA_BPF_FLAGS_GEN] = { .type = NLA_U32 }, [TCA_BPF_FD] = { .type = NLA_U32 }, [TCA_BPF_NAME] = { .type = NLA_NUL_STRING, .len = CLS_BPF_NAME_LEN }, [TCA_BPF_OPS_LEN] = { .type = NLA_U16 }, [TCA_BPF_OPS] = { .type = NLA_BINARY, .len = sizeof(struct sock_filter) * BPF_MAXINSNS }, }; static int cls_bpf_exec_opcode(int code) { switch (code) { case TC_ACT_OK: case TC_ACT_SHOT: case TC_ACT_STOLEN: case TC_ACT_TRAP: case TC_ACT_REDIRECT: case TC_ACT_UNSPEC: return code; default: return TC_ACT_UNSPEC; } } TC_INDIRECT_SCOPE int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct tcf_result *res) { struct cls_bpf_head *head = rcu_dereference_bh(tp->root); bool at_ingress = skb_at_tc_ingress(skb); struct cls_bpf_prog *prog; int ret = -1; list_for_each_entry_rcu(prog, &head->plist, link) { int filter_res; qdisc_skb_cb(skb)->tc_classid = prog->res.classid; if (tc_skip_sw(prog->gen_flags)) { filter_res = prog->exts_integrated ? TC_ACT_UNSPEC : 0; } else if (at_ingress) { /* It is safe to push/pull even if skb_shared() */ __skb_push(skb, skb->mac_len); bpf_compute_data_pointers(skb); filter_res = bpf_prog_run(prog->filter, skb); __skb_pull(skb, skb->mac_len); } else { bpf_compute_data_pointers(skb); filter_res = bpf_prog_run(prog->filter, skb); } if (unlikely(!skb->tstamp && skb->tstamp_type)) skb->tstamp_type = SKB_CLOCK_REALTIME; if (prog->exts_integrated) { res->class = 0; res->classid = TC_H_MAJ(prog->res.classid) | qdisc_skb_cb(skb)->tc_classid; ret = cls_bpf_exec_opcode(filter_res); if (ret == TC_ACT_UNSPEC) continue; break; } if (filter_res == 0) continue; if (filter_res != -1) { res->class = 0; res->classid = filter_res; } else { *res = prog->res; } ret = tcf_exts_exec(skb, &prog->exts, res); if (ret < 0) continue; break; } return ret; } static bool cls_bpf_is_ebpf(const struct cls_bpf_prog *prog) { return !prog->bpf_ops; } static int cls_bpf_offload_cmd(struct tcf_proto *tp, struct cls_bpf_prog *prog, struct cls_bpf_prog *oldprog, struct netlink_ext_ack *extack) { struct tcf_block *block = tp->chain->block; struct tc_cls_bpf_offload cls_bpf = {}; struct cls_bpf_prog *obj; bool skip_sw; int err; skip_sw = prog && tc_skip_sw(prog->gen_flags); obj = prog ?: oldprog; tc_cls_common_offload_init(&cls_bpf.common, tp, obj->gen_flags, extack); cls_bpf.command = TC_CLSBPF_OFFLOAD; cls_bpf.exts = &obj->exts; cls_bpf.prog = prog ? prog->filter : NULL; cls_bpf.oldprog = oldprog ? 
oldprog->filter : NULL; cls_bpf.name = obj->bpf_name; cls_bpf.exts_integrated = obj->exts_integrated; if (oldprog && prog) err = tc_setup_cb_replace(block, tp, TC_SETUP_CLSBPF, &cls_bpf, skip_sw, &oldprog->gen_flags, &oldprog->in_hw_count, &prog->gen_flags, &prog->in_hw_count, true); else if (prog) err = tc_setup_cb_add(block, tp, TC_SETUP_CLSBPF, &cls_bpf, skip_sw, &prog->gen_flags, &prog->in_hw_count, true); else err = tc_setup_cb_destroy(block, tp, TC_SETUP_CLSBPF, &cls_bpf, skip_sw, &oldprog->gen_flags, &oldprog->in_hw_count, true); if (prog && err) { cls_bpf_offload_cmd(tp, oldprog, prog, extack); return err; } if (prog && skip_sw && !(prog->gen_flags & TCA_CLS_FLAGS_IN_HW)) return -EINVAL; return 0; } static u32 cls_bpf_flags(u32 flags) { return flags & CLS_BPF_SUPPORTED_GEN_FLAGS; } static int cls_bpf_offload(struct tcf_proto *tp, struct cls_bpf_prog *prog, struct cls_bpf_prog *oldprog, struct netlink_ext_ack *extack) { if (prog && oldprog && cls_bpf_flags(prog->gen_flags) != cls_bpf_flags(oldprog->gen_flags)) return -EINVAL; if (prog && tc_skip_hw(prog->gen_flags)) prog = NULL; if (oldprog && tc_skip_hw(oldprog->gen_flags)) oldprog = NULL; if (!prog && !oldprog) return 0; return cls_bpf_offload_cmd(tp, prog, oldprog, extack); } static void cls_bpf_stop_offload(struct tcf_proto *tp, struct cls_bpf_prog *prog, struct netlink_ext_ack *extack) { int err; err = cls_bpf_offload_cmd(tp, NULL, prog, extack); if (err) pr_err("Stopping hardware offload failed: %d\n", err); } static void cls_bpf_offload_update_stats(struct tcf_proto *tp, struct cls_bpf_prog *prog) { struct tcf_block *block = tp->chain->block; struct tc_cls_bpf_offload cls_bpf = {}; tc_cls_common_offload_init(&cls_bpf.common, tp, prog->gen_flags, NULL); cls_bpf.command = TC_CLSBPF_STATS; cls_bpf.exts = &prog->exts; cls_bpf.prog = prog->filter; cls_bpf.name = prog->bpf_name; cls_bpf.exts_integrated = prog->exts_integrated; tc_setup_cb_call(block, TC_SETUP_CLSBPF, &cls_bpf, false, true); } static int cls_bpf_init(struct tcf_proto *tp) { struct cls_bpf_head *head; head = kzalloc(sizeof(*head), GFP_KERNEL); if (head == NULL) return -ENOBUFS; INIT_LIST_HEAD_RCU(&head->plist); idr_init(&head->handle_idr); rcu_assign_pointer(tp->root, head); return 0; } static void cls_bpf_free_parms(struct cls_bpf_prog *prog) { if (cls_bpf_is_ebpf(prog)) bpf_prog_put(prog->filter); else bpf_prog_destroy(prog->filter); kfree(prog->bpf_name); kfree(prog->bpf_ops); } static void __cls_bpf_delete_prog(struct cls_bpf_prog *prog) { tcf_exts_destroy(&prog->exts); tcf_exts_put_net(&prog->exts); cls_bpf_free_parms(prog); kfree(prog); } static void cls_bpf_delete_prog_work(struct work_struct *work) { struct cls_bpf_prog *prog = container_of(to_rcu_work(work), struct cls_bpf_prog, rwork); rtnl_lock(); __cls_bpf_delete_prog(prog); rtnl_unlock(); } static void __cls_bpf_delete(struct tcf_proto *tp, struct cls_bpf_prog *prog, struct netlink_ext_ack *extack) { struct cls_bpf_head *head = rtnl_dereference(tp->root); idr_remove(&head->handle_idr, prog->handle); cls_bpf_stop_offload(tp, prog, extack); list_del_rcu(&prog->link); tcf_unbind_filter(tp, &prog->res); if (tcf_exts_get_net(&prog->exts)) tcf_queue_work(&prog->rwork, cls_bpf_delete_prog_work); else __cls_bpf_delete_prog(prog); } static int cls_bpf_delete(struct tcf_proto *tp, void *arg, bool *last, bool rtnl_held, struct netlink_ext_ack *extack) { struct cls_bpf_head *head = rtnl_dereference(tp->root); __cls_bpf_delete(tp, arg, extack); *last = list_empty(&head->plist); return 0; } static void 
cls_bpf_destroy(struct tcf_proto *tp, bool rtnl_held, struct netlink_ext_ack *extack) { struct cls_bpf_head *head = rtnl_dereference(tp->root); struct cls_bpf_prog *prog, *tmp; list_for_each_entry_safe(prog, tmp, &head->plist, link) __cls_bpf_delete(tp, prog, extack); idr_destroy(&head->handle_idr); kfree_rcu(head, rcu); } static void *cls_bpf_get(struct tcf_proto *tp, u32 handle) { struct cls_bpf_head *head = rtnl_dereference(tp->root); struct cls_bpf_prog *prog; list_for_each_entry(prog, &head->plist, link) { if (prog->handle == handle) return prog; } return NULL; } static int cls_bpf_prog_from_ops(struct nlattr **tb, struct cls_bpf_prog *prog) { struct sock_filter *bpf_ops; struct sock_fprog_kern fprog_tmp; struct bpf_prog *fp; u16 bpf_size, bpf_num_ops; int ret; bpf_num_ops = nla_get_u16(tb[TCA_BPF_OPS_LEN]); if (bpf_num_ops > BPF_MAXINSNS || bpf_num_ops == 0) return -EINVAL; bpf_size = bpf_num_ops * sizeof(*bpf_ops); if (bpf_size != nla_len(tb[TCA_BPF_OPS])) return -EINVAL; bpf_ops = kmemdup(nla_data(tb[TCA_BPF_OPS]), bpf_size, GFP_KERNEL); if (bpf_ops == NULL) return -ENOMEM; fprog_tmp.len = bpf_num_ops; fprog_tmp.filter = bpf_ops; ret = bpf_prog_create(&fp, &fprog_tmp); if (ret < 0) { kfree(bpf_ops); return ret; } prog->bpf_ops = bpf_ops; prog->bpf_num_ops = bpf_num_ops; prog->bpf_name = NULL; prog->filter = fp; return 0; } static int cls_bpf_prog_from_efd(struct nlattr **tb, struct cls_bpf_prog *prog, u32 gen_flags, const struct tcf_proto *tp) { struct bpf_prog *fp; char *name = NULL; bool skip_sw; u32 bpf_fd; bpf_fd = nla_get_u32(tb[TCA_BPF_FD]); skip_sw = gen_flags & TCA_CLS_FLAGS_SKIP_SW; fp = bpf_prog_get_type_dev(bpf_fd, BPF_PROG_TYPE_SCHED_CLS, skip_sw); if (IS_ERR(fp)) return PTR_ERR(fp); if (tb[TCA_BPF_NAME]) { name = nla_memdup(tb[TCA_BPF_NAME], GFP_KERNEL); if (!name) { bpf_prog_put(fp); return -ENOMEM; } } prog->bpf_ops = NULL; prog->bpf_name = name; prog->filter = fp; if (fp->dst_needed) tcf_block_netif_keep_dst(tp->chain->block); return 0; } static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, void **arg, u32 flags, struct netlink_ext_ack *extack) { struct cls_bpf_head *head = rtnl_dereference(tp->root); bool is_bpf, is_ebpf, have_exts = false; struct cls_bpf_prog *oldprog = *arg; struct nlattr *tb[TCA_BPF_MAX + 1]; bool bound_to_filter = false; struct cls_bpf_prog *prog; u32 gen_flags = 0; int ret; if (tca[TCA_OPTIONS] == NULL) return -EINVAL; ret = nla_parse_nested_deprecated(tb, TCA_BPF_MAX, tca[TCA_OPTIONS], bpf_policy, NULL); if (ret < 0) return ret; prog = kzalloc(sizeof(*prog), GFP_KERNEL); if (!prog) return -ENOBUFS; ret = tcf_exts_init(&prog->exts, net, TCA_BPF_ACT, TCA_BPF_POLICE); if (ret < 0) goto errout; if (oldprog) { if (handle && oldprog->handle != handle) { ret = -EINVAL; goto errout; } } if (handle == 0) { handle = 1; ret = idr_alloc_u32(&head->handle_idr, prog, &handle, INT_MAX, GFP_KERNEL); } else if (!oldprog) { ret = idr_alloc_u32(&head->handle_idr, prog, &handle, handle, GFP_KERNEL); } if (ret) goto errout; prog->handle = handle; is_bpf = tb[TCA_BPF_OPS_LEN] && tb[TCA_BPF_OPS]; is_ebpf = tb[TCA_BPF_FD]; if ((!is_bpf && !is_ebpf) || (is_bpf && is_ebpf)) { ret = -EINVAL; goto errout_idr; } ret = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &prog->exts, flags, extack); if (ret < 0) goto errout_idr; if (tb[TCA_BPF_FLAGS]) { u32 bpf_flags = nla_get_u32(tb[TCA_BPF_FLAGS]); if (bpf_flags & ~TCA_BPF_FLAG_ACT_DIRECT) { ret = -EINVAL; goto errout_idr; } have_exts = 
bpf_flags & TCA_BPF_FLAG_ACT_DIRECT; } if (tb[TCA_BPF_FLAGS_GEN]) { gen_flags = nla_get_u32(tb[TCA_BPF_FLAGS_GEN]); if (gen_flags & ~CLS_BPF_SUPPORTED_GEN_FLAGS || !tc_flags_valid(gen_flags)) { ret = -EINVAL; goto errout_idr; } } prog->exts_integrated = have_exts; prog->gen_flags = gen_flags; ret = is_bpf ? cls_bpf_prog_from_ops(tb, prog) : cls_bpf_prog_from_efd(tb, prog, gen_flags, tp); if (ret < 0) goto errout_idr; if (tb[TCA_BPF_CLASSID]) { prog->res.classid = nla_get_u32(tb[TCA_BPF_CLASSID]); tcf_bind_filter(tp, &prog->res, base); bound_to_filter = true; } ret = cls_bpf_offload(tp, prog, oldprog, extack); if (ret) goto errout_parms; if (!tc_in_hw(prog->gen_flags)) prog->gen_flags |= TCA_CLS_FLAGS_NOT_IN_HW; tcf_proto_update_usesw(tp, prog->gen_flags); if (oldprog) { idr_replace(&head->handle_idr, prog, handle); list_replace_rcu(&oldprog->link, &prog->link); tcf_unbind_filter(tp, &oldprog->res); tcf_exts_get_net(&oldprog->exts); tcf_queue_work(&oldprog->rwork, cls_bpf_delete_prog_work); } else { list_add_rcu(&prog->link, &head->plist); } *arg = prog; return 0; errout_parms: if (bound_to_filter) tcf_unbind_filter(tp, &prog->res); cls_bpf_free_parms(prog); errout_idr: if (!oldprog) idr_remove(&head->handle_idr, prog->handle); errout: tcf_exts_destroy(&prog->exts); kfree(prog); return ret; } static int cls_bpf_dump_bpf_info(const struct cls_bpf_prog *prog, struct sk_buff *skb) { struct nlattr *nla; if (nla_put_u16(skb, TCA_BPF_OPS_LEN, prog->bpf_num_ops)) return -EMSGSIZE; nla = nla_reserve(skb, TCA_BPF_OPS, prog->bpf_num_ops * sizeof(struct sock_filter)); if (nla == NULL) return -EMSGSIZE; memcpy(nla_data(nla), prog->bpf_ops, nla_len(nla)); return 0; } static int cls_bpf_dump_ebpf_info(const struct cls_bpf_prog *prog, struct sk_buff *skb) { struct nlattr *nla; if (prog->bpf_name && nla_put_string(skb, TCA_BPF_NAME, prog->bpf_name)) return -EMSGSIZE; if (nla_put_u32(skb, TCA_BPF_ID, prog->filter->aux->id)) return -EMSGSIZE; nla = nla_reserve(skb, TCA_BPF_TAG, sizeof(prog->filter->tag)); if (nla == NULL) return -EMSGSIZE; memcpy(nla_data(nla), prog->filter->tag, nla_len(nla)); return 0; } static int cls_bpf_dump(struct net *net, struct tcf_proto *tp, void *fh, struct sk_buff *skb, struct tcmsg *tm, bool rtnl_held) { struct cls_bpf_prog *prog = fh; struct nlattr *nest; u32 bpf_flags = 0; int ret; if (prog == NULL) return skb->len; tm->tcm_handle = prog->handle; cls_bpf_offload_update_stats(tp, prog); nest = nla_nest_start_noflag(skb, TCA_OPTIONS); if (nest == NULL) goto nla_put_failure; if (prog->res.classid && nla_put_u32(skb, TCA_BPF_CLASSID, prog->res.classid)) goto nla_put_failure; if (cls_bpf_is_ebpf(prog)) ret = cls_bpf_dump_ebpf_info(prog, skb); else ret = cls_bpf_dump_bpf_info(prog, skb); if (ret) goto nla_put_failure; if (tcf_exts_dump(skb, &prog->exts) < 0) goto nla_put_failure; if (prog->exts_integrated) bpf_flags |= TCA_BPF_FLAG_ACT_DIRECT; if (bpf_flags && nla_put_u32(skb, TCA_BPF_FLAGS, bpf_flags)) goto nla_put_failure; if (prog->gen_flags && nla_put_u32(skb, TCA_BPF_FLAGS_GEN, prog->gen_flags)) goto nla_put_failure; nla_nest_end(skb, nest); if (tcf_exts_dump_stats(skb, &prog->exts) < 0) goto nla_put_failure; return skb->len; nla_put_failure: nla_nest_cancel(skb, nest); return -1; } static void cls_bpf_bind_class(void *fh, u32 classid, unsigned long cl, void *q, unsigned long base) { struct cls_bpf_prog *prog = fh; tc_cls_bind_class(classid, cl, q, &prog->res, base); } static void cls_bpf_walk(struct tcf_proto *tp, struct tcf_walker *arg, bool rtnl_held) { struct cls_bpf_head 
*head = rtnl_dereference(tp->root); struct cls_bpf_prog *prog; list_for_each_entry(prog, &head->plist, link) { if (!tc_cls_stats_dump(tp, arg, prog)) break; } } static int cls_bpf_reoffload(struct tcf_proto *tp, bool add, flow_setup_cb_t *cb, void *cb_priv, struct netlink_ext_ack *extack) { struct cls_bpf_head *head = rtnl_dereference(tp->root); struct tcf_block *block = tp->chain->block; struct tc_cls_bpf_offload cls_bpf = {}; struct cls_bpf_prog *prog; int err; list_for_each_entry(prog, &head->plist, link) { if (tc_skip_hw(prog->gen_flags)) continue; tc_cls_common_offload_init(&cls_bpf.common, tp, prog->gen_flags, extack); cls_bpf.command = TC_CLSBPF_OFFLOAD; cls_bpf.exts = &prog->exts; cls_bpf.prog = add ? prog->filter : NULL; cls_bpf.oldprog = add ? NULL : prog->filter; cls_bpf.name = prog->bpf_name; cls_bpf.exts_integrated = prog->exts_integrated; err = tc_setup_cb_reoffload(block, tp, add, cb, TC_SETUP_CLSBPF, &cls_bpf, cb_priv, &prog->gen_flags, &prog->in_hw_count); if (err) return err; } return 0; } static struct tcf_proto_ops cls_bpf_ops __read_mostly = { .kind = "bpf", .owner = THIS_MODULE, .classify = cls_bpf_classify, .init = cls_bpf_init, .destroy = cls_bpf_destroy, .get = cls_bpf_get, .change = cls_bpf_change, .delete = cls_bpf_delete, .walk = cls_bpf_walk, .reoffload = cls_bpf_reoffload, .dump = cls_bpf_dump, .bind_class = cls_bpf_bind_class, }; MODULE_ALIAS_NET_CLS("bpf"); static int __init cls_bpf_init_mod(void) { return register_tcf_proto_ops(&cls_bpf_ops); } static void __exit cls_bpf_exit_mod(void) { unregister_tcf_proto_ops(&cls_bpf_ops); } module_init(cls_bpf_init_mod); module_exit(cls_bpf_exit_mod);
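/*
 * Editor's note: a minimal sketch, not part of the original file, of a
 * BPF_PROG_TYPE_SCHED_CLS program written against the return-value
 * contract used by cls_bpf_classify() above: 0 means "no match, try the
 * next filter", -1 selects the classid configured on the filter itself,
 * and any other value is used as the classid directly (or interpreted as
 * a TC_ACT_* opcode when the filter was created with the direct-action
 * flag). The section name and classid values are illustrative and assume
 * a libbpf-style loader.
 */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("tc")
int example_cls(struct __sk_buff *skb)
{
	if (skb->len < 128)
		return 0x00010010;	/* short packets -> class 1:10 */
	return -1;			/* everything else -> the filter's configured classid */
}

char _license[] SEC("license") = "GPL";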
// SPDX-License-Identifier: GPL-2.0 #include <linux/perf_event.h> #include <linux/sysfs.h> #include <linux/nospec.h> #include <asm/cpu_device_id.h> #include <asm/msr.h> #include "probe.h" enum perf_msr_id { PERF_MSR_TSC = 0, PERF_MSR_APERF = 1, PERF_MSR_MPERF = 2, PERF_MSR_PPERF = 3, PERF_MSR_SMI = 4, PERF_MSR_PTSC = 5, PERF_MSR_IRPERF = 6, PERF_MSR_THERM = 7, PERF_MSR_EVENT_MAX, }; static bool test_aperfmperf(int idx, void *data) { return boot_cpu_has(X86_FEATURE_APERFMPERF); } static bool test_ptsc(int idx, void *data) { return boot_cpu_has(X86_FEATURE_PTSC); } static bool test_irperf(int idx, void *data) { return boot_cpu_has(X86_FEATURE_IRPERF); } static bool test_therm_status(int idx, void *data) { return boot_cpu_has(X86_FEATURE_DTHERM); } static bool test_intel(int idx, void *data) { if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || boot_cpu_data.x86 != 6) return false; switch (boot_cpu_data.x86_vfm) { case INTEL_NEHALEM: case INTEL_NEHALEM_G: case INTEL_NEHALEM_EP: case INTEL_NEHALEM_EX: case INTEL_WESTMERE: case INTEL_WESTMERE_EP: case INTEL_WESTMERE_EX: case INTEL_SANDYBRIDGE: case INTEL_SANDYBRIDGE_X: case INTEL_IVYBRIDGE: case INTEL_IVYBRIDGE_X: case INTEL_HASWELL: case INTEL_HASWELL_X: case INTEL_HASWELL_L: case INTEL_HASWELL_G: case INTEL_BROADWELL: case INTEL_BROADWELL_D: case INTEL_BROADWELL_G: case INTEL_BROADWELL_X: case INTEL_SAPPHIRERAPIDS_X: case INTEL_EMERALDRAPIDS_X: case INTEL_GRANITERAPIDS_X: case INTEL_GRANITERAPIDS_D: case INTEL_ATOM_SILVERMONT: case INTEL_ATOM_SILVERMONT_D: case INTEL_ATOM_AIRMONT: case INTEL_ATOM_GOLDMONT: case INTEL_ATOM_GOLDMONT_D: case INTEL_ATOM_GOLDMONT_PLUS: case INTEL_ATOM_TREMONT_D: case INTEL_ATOM_TREMONT: case INTEL_ATOM_TREMONT_L: case INTEL_XEON_PHI_KNL: case INTEL_XEON_PHI_KNM: if (idx == PERF_MSR_SMI) return true; break; case INTEL_SKYLAKE_L: case INTEL_SKYLAKE: case INTEL_SKYLAKE_X: case INTEL_KABYLAKE_L: case INTEL_KABYLAKE: case INTEL_COMETLAKE_L: case INTEL_COMETLAKE: case INTEL_ICELAKE_L: case INTEL_ICELAKE: case INTEL_ICELAKE_X: case INTEL_ICELAKE_D: case INTEL_TIGERLAKE_L: case INTEL_TIGERLAKE: case INTEL_ROCKETLAKE: case INTEL_ALDERLAKE: case INTEL_ALDERLAKE_L: case INTEL_ATOM_GRACEMONT: case INTEL_RAPTORLAKE: case INTEL_RAPTORLAKE_P: case INTEL_RAPTORLAKE_S: case INTEL_METEORLAKE: case INTEL_METEORLAKE_L: if (idx == PERF_MSR_SMI || idx ==
PERF_MSR_PPERF) return true; break; } return false; } PMU_EVENT_ATTR_STRING(tsc, attr_tsc, "event=0x00" ); PMU_EVENT_ATTR_STRING(aperf, attr_aperf, "event=0x01" ); PMU_EVENT_ATTR_STRING(mperf, attr_mperf, "event=0x02" ); PMU_EVENT_ATTR_STRING(pperf, attr_pperf, "event=0x03" ); PMU_EVENT_ATTR_STRING(smi, attr_smi, "event=0x04" ); PMU_EVENT_ATTR_STRING(ptsc, attr_ptsc, "event=0x05" ); PMU_EVENT_ATTR_STRING(irperf, attr_irperf, "event=0x06" ); PMU_EVENT_ATTR_STRING(cpu_thermal_margin, attr_therm, "event=0x07" ); PMU_EVENT_ATTR_STRING(cpu_thermal_margin.snapshot, attr_therm_snap, "1" ); PMU_EVENT_ATTR_STRING(cpu_thermal_margin.unit, attr_therm_unit, "C" ); static unsigned long msr_mask; PMU_EVENT_GROUP(events, aperf); PMU_EVENT_GROUP(events, mperf); PMU_EVENT_GROUP(events, pperf); PMU_EVENT_GROUP(events, smi); PMU_EVENT_GROUP(events, ptsc); PMU_EVENT_GROUP(events, irperf); static struct attribute *attrs_therm[] = { &attr_therm.attr.attr, &attr_therm_snap.attr.attr, &attr_therm_unit.attr.attr, NULL, }; static struct attribute_group group_therm = { .name = "events", .attrs = attrs_therm, }; static struct perf_msr msr[] = { [PERF_MSR_TSC] = { .no_check = true, }, [PERF_MSR_APERF] = { MSR_IA32_APERF, &group_aperf, test_aperfmperf, }, [PERF_MSR_MPERF] = { MSR_IA32_MPERF, &group_mperf, test_aperfmperf, }, [PERF_MSR_PPERF] = { MSR_PPERF, &group_pperf, test_intel, }, [PERF_MSR_SMI] = { MSR_SMI_COUNT, &group_smi, test_intel, }, [PERF_MSR_PTSC] = { MSR_F15H_PTSC, &group_ptsc, test_ptsc, }, [PERF_MSR_IRPERF] = { MSR_F17H_IRPERF, &group_irperf, test_irperf, }, [PERF_MSR_THERM] = { MSR_IA32_THERM_STATUS, &group_therm, test_therm_status, }, }; static struct attribute *events_attrs[] = { &attr_tsc.attr.attr, NULL, }; static struct attribute_group events_attr_group = { .name = "events", .attrs = events_attrs, }; PMU_FORMAT_ATTR(event, "config:0-63"); static struct attribute *format_attrs[] = { &format_attr_event.attr, NULL, }; static struct attribute_group format_attr_group = { .name = "format", .attrs = format_attrs, }; static const struct attribute_group *attr_groups[] = { &events_attr_group, &format_attr_group, NULL, }; static const struct attribute_group *attr_update[] = { &group_aperf, &group_mperf, &group_pperf, &group_smi, &group_ptsc, &group_irperf, &group_therm, NULL, }; static int msr_event_init(struct perf_event *event) { u64 cfg = event->attr.config; if (event->attr.type != event->pmu->type) return -ENOENT; /* unsupported modes and filters */ if (event->attr.sample_period) /* no sampling */ return -EINVAL; if (cfg >= PERF_MSR_EVENT_MAX) return -EINVAL; cfg = array_index_nospec((unsigned long)cfg, PERF_MSR_EVENT_MAX); if (!(msr_mask & (1 << cfg))) return -EINVAL; event->hw.idx = -1; event->hw.event_base = msr[cfg].msr; event->hw.config = cfg; return 0; } static inline u64 msr_read_counter(struct perf_event *event) { u64 now; if (event->hw.event_base) rdmsrq(event->hw.event_base, now); else now = rdtsc_ordered(); return now; } static void msr_event_update(struct perf_event *event) { u64 prev, now; s64 delta; /* Careful, an NMI might modify the previous event value: */ prev = local64_read(&event->hw.prev_count); do { now = msr_read_counter(event); } while (!local64_try_cmpxchg(&event->hw.prev_count, &prev, now)); delta = now - prev; if (unlikely(event->hw.event_base == MSR_SMI_COUNT)) { delta = sign_extend64(delta, 31); local64_add(delta, &event->count); } else if (unlikely(event->hw.event_base == MSR_IA32_THERM_STATUS)) { /* If valid, extract digital readout, otherwise set to -1: */ now = now & 
(1ULL << 31) ? (now >> 16) & 0x3f : -1; local64_set(&event->count, now); } else { local64_add(delta, &event->count); } } static void msr_event_start(struct perf_event *event, int flags) { u64 now = msr_read_counter(event); local64_set(&event->hw.prev_count, now); } static void msr_event_stop(struct perf_event *event, int flags) { msr_event_update(event); } static void msr_event_del(struct perf_event *event, int flags) { msr_event_stop(event, PERF_EF_UPDATE); } static int msr_event_add(struct perf_event *event, int flags) { if (flags & PERF_EF_START) msr_event_start(event, flags); return 0; } static struct pmu pmu_msr = { .task_ctx_nr = perf_sw_context, .attr_groups = attr_groups, .event_init = msr_event_init, .add = msr_event_add, .del = msr_event_del, .start = msr_event_start, .stop = msr_event_stop, .read = msr_event_update, .capabilities = PERF_PMU_CAP_NO_INTERRUPT | PERF_PMU_CAP_NO_EXCLUDE, .attr_update = attr_update, }; static int __init msr_init(void) { if (!boot_cpu_has(X86_FEATURE_TSC)) { pr_cont("no MSR PMU driver.\n"); return 0; } msr_mask = perf_msr_probe(msr, PERF_MSR_EVENT_MAX, true, NULL); perf_pmu_register(&pmu_msr, "msr", -1); return 0; } device_initcall(msr_init);
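/*
 * Editor's note: a user-space sketch, not part of the original file,
 * showing how the events exported above are typically consumed through
 * perf_event_open(2). The PMU's dynamic type id is read from sysfs and
 * the config value is the event number from the "event=0x.." strings
 * (0x01 == aperf here); everything else is illustrative.
 */
#include <linux/perf_event.h>
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	struct perf_event_attr attr;
	unsigned int type;
	uint64_t count;
	FILE *f;
	int fd;

	/* dynamic PMU type registered by msr_init() as "msr" */
	f = fopen("/sys/bus/event_source/devices/msr/type", "r");
	if (!f || fscanf(f, "%u", &type) != 1)
		return 1;
	fclose(f);

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = type;
	attr.config = 0x01;	/* aperf */

	/* counting event for the calling thread, on whatever CPU it runs */
	fd = syscall(SYS_perf_event_open, &attr, 0, -1, -1, 0);
	if (fd < 0)
		return 1;

	sleep(1);
	if (read(fd, &count, sizeof(count)) == sizeof(count))
		printf("aperf count since open: %llu\n", (unsigned long long)count);
	close(fd);
	return 0;
}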
// SPDX-License-Identifier: GPL-2.0-or-later /* * ip_vs_nfct.c: Netfilter connection tracking support for IPVS * * Portions Copyright (C) 2001-2002 * Antefacto Ltd, 181 Parnell St, Dublin 1, Ireland. * * Portions Copyright (C) 2003-2010 * Julian Anastasov * * Authors: * Ben North <ben@redfrontdoor.org> * Julian Anastasov <ja@ssi.bg> Reorganize and sync with latest kernels * Hannes Eder <heder@google.com> Extend NFCT support for FTP, ipvs match * * Current status: * * - provide conntrack confirmation for new and related connections; * this way we can see their proper conntrack state in all hooks * - support for all forwarding methods, not only NAT * - FTP support (NAT), ability to support other NAT apps with expectations * - to correctly create expectations for related NAT connections the proper * NF conntrack support must already be installed, e.g. ip_vs_ftp requires * nf_conntrack_ftp ...
iptables_nat for the same ports (but no iptables * NAT rules are needed) * - alter reply for NAT when forwarding packet in original direction: * conntrack from client in NEW or RELATED (Passive FTP DATA) state or * when RELATED conntrack is created from real server (Active FTP DATA) * - if iptables_nat is not loaded the Passive FTP will not work (the * PASV response can not be NAT-ed) but Active FTP should work */ #define KMSG_COMPONENT "IPVS" #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/errno.h> #include <linux/compiler.h> #include <linux/vmalloc.h> #include <linux/skbuff.h> #include <net/ip.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv4.h> #include <net/ip_vs.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_expect.h> #include <net/netfilter/nf_conntrack_seqadj.h> #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_zones.h> #define FMT_TUPLE "%s:%u->%s:%u/%u" #define ARG_TUPLE(T) IP_VS_DBG_ADDR((T)->src.l3num, &(T)->src.u3), \ ntohs((T)->src.u.all), \ IP_VS_DBG_ADDR((T)->src.l3num, &(T)->dst.u3), \ ntohs((T)->dst.u.all), \ (T)->dst.protonum #define FMT_CONN "%s:%u->%s:%u->%s:%u/%u:%u" #define ARG_CONN(C) IP_VS_DBG_ADDR((C)->af, &((C)->caddr)), \ ntohs((C)->cport), \ IP_VS_DBG_ADDR((C)->af, &((C)->vaddr)), \ ntohs((C)->vport), \ IP_VS_DBG_ADDR((C)->daf, &((C)->daddr)), \ ntohs((C)->dport), \ (C)->protocol, (C)->state void ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp, int outin) { enum ip_conntrack_info ctinfo; struct nf_conn *ct = nf_ct_get(skb, &ctinfo); struct nf_conntrack_tuple new_tuple; if (ct == NULL || nf_ct_is_confirmed(ct) || nf_ct_is_dying(ct)) return; /* Never alter conntrack for non-NAT conns */ if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) return; /* Never alter conntrack for OPS conns (no reply is expected) */ if (cp->flags & IP_VS_CONN_F_ONE_PACKET) return; /* Alter reply only in original direction */ if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) return; /* Applications may adjust TCP seqs */ if (cp->app && nf_ct_protonum(ct) == IPPROTO_TCP && !nfct_seqadj(ct) && !nfct_seqadj_ext_add(ct)) return; /* * The connection is not yet in the hashtable, so we update it. * CIP->VIP will remain the same, so leave the tuple in * IP_CT_DIR_ORIGINAL untouched. When the reply comes back from the * real-server we will see RIP->DIP. */ new_tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple; /* * This will also take care of UDP and other protocols. */ if (outin) { new_tuple.src.u3 = cp->daddr; if (new_tuple.dst.protonum != IPPROTO_ICMP && new_tuple.dst.protonum != IPPROTO_ICMPV6) new_tuple.src.u.tcp.port = cp->dport; } else { new_tuple.dst.u3 = cp->vaddr; if (new_tuple.dst.protonum != IPPROTO_ICMP && new_tuple.dst.protonum != IPPROTO_ICMPV6) new_tuple.dst.u.tcp.port = cp->vport; } IP_VS_DBG_BUF(7, "%s: Updating conntrack ct=%p, status=0x%lX, " "ctinfo=%d, old reply=" FMT_TUPLE "\n", __func__, ct, ct->status, ctinfo, ARG_TUPLE(&ct->tuplehash[IP_CT_DIR_REPLY].tuple)); IP_VS_DBG_BUF(7, "%s: Updating conntrack ct=%p, status=0x%lX, " "ctinfo=%d, new reply=" FMT_TUPLE "\n", __func__, ct, ct->status, ctinfo, ARG_TUPLE(&new_tuple)); nf_conntrack_alter_reply(ct, &new_tuple); IP_VS_DBG_BUF(7, "%s: Updated conntrack ct=%p for cp=" FMT_CONN "\n", __func__, ct, ARG_CONN(cp)); } int ip_vs_confirm_conntrack(struct sk_buff *skb) { return nf_conntrack_confirm(skb); } /* * Called from init_conntrack() as expectfn handler. 
*/ static void ip_vs_nfct_expect_callback(struct nf_conn *ct, struct nf_conntrack_expect *exp) { struct nf_conntrack_tuple *orig, new_reply; struct ip_vs_conn *cp; struct ip_vs_conn_param p; struct net *net = nf_ct_net(ct); /* * We assume that no NF locks are held before this callback. * ip_vs_conn_out_get and ip_vs_conn_in_get should match their * expectations even if they use wildcard values, now we provide the * actual values from the newly created original conntrack direction. * The conntrack is confirmed when packet reaches IPVS hooks. */ /* RS->CLIENT */ orig = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; ip_vs_conn_fill_param(net_ipvs(net), exp->tuple.src.l3num, orig->dst.protonum, &orig->src.u3, orig->src.u.tcp.port, &orig->dst.u3, orig->dst.u.tcp.port, &p); cp = ip_vs_conn_out_get(&p); if (cp) { /* Change reply CLIENT->RS to CLIENT->VS */ IP_VS_DBG_BUF(7, "%s: for ct=%p, status=0x%lX found inout cp=" FMT_CONN "\n", __func__, ct, ct->status, ARG_CONN(cp)); new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple; IP_VS_DBG_BUF(7, "%s: ct=%p before alter: reply tuple=" FMT_TUPLE "\n", __func__, ct, ARG_TUPLE(&new_reply)); new_reply.dst.u3 = cp->vaddr; new_reply.dst.u.tcp.port = cp->vport; goto alter; } /* CLIENT->VS */ cp = ip_vs_conn_in_get(&p); if (cp) { /* Change reply VS->CLIENT to RS->CLIENT */ IP_VS_DBG_BUF(7, "%s: for ct=%p, status=0x%lX found outin cp=" FMT_CONN "\n", __func__, ct, ct->status, ARG_CONN(cp)); new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple; IP_VS_DBG_BUF(7, "%s: ct=%p before alter: reply tuple=" FMT_TUPLE "\n", __func__, ct, ARG_TUPLE(&new_reply)); new_reply.src.u3 = cp->daddr; new_reply.src.u.tcp.port = cp->dport; goto alter; } IP_VS_DBG_BUF(7, "%s: ct=%p, status=0x%lX, tuple=" FMT_TUPLE " - unknown expect\n", __func__, ct, ct->status, ARG_TUPLE(orig)); return; alter: /* Never alter conntrack for non-NAT conns */ if (IP_VS_FWD_METHOD(cp) == IP_VS_CONN_F_MASQ) nf_conntrack_alter_reply(ct, &new_reply); ip_vs_conn_put(cp); return; } /* * Create NF conntrack expectation with wildcard (optional) source port. * Then the default callback function will alter the reply and will confirm * the conntrack entry when the first packet comes. * Use port 0 to expect connection from any port. */ void ip_vs_nfct_expect_related(struct sk_buff *skb, struct nf_conn *ct, struct ip_vs_conn *cp, u_int8_t proto, const __be16 port, int from_rs) { struct nf_conntrack_expect *exp; if (ct == NULL) return; exp = nf_ct_expect_alloc(ct); if (!exp) return; nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct), from_rs ? &cp->daddr : &cp->caddr, from_rs ? &cp->caddr : &cp->vaddr, proto, port ? &port : NULL, from_rs ? 
&cp->cport : &cp->vport); exp->expectfn = ip_vs_nfct_expect_callback; IP_VS_DBG_BUF(7, "%s: ct=%p, expect tuple=" FMT_TUPLE "\n", __func__, ct, ARG_TUPLE(&exp->tuple)); nf_ct_expect_related(exp, 0); nf_ct_expect_put(exp); } EXPORT_SYMBOL(ip_vs_nfct_expect_related); /* * Our connection was terminated, try to drop the conntrack immediately */ void ip_vs_conn_drop_conntrack(struct ip_vs_conn *cp) { struct nf_conntrack_tuple_hash *h; struct nf_conn *ct; struct nf_conntrack_tuple tuple; if (!cp->cport) return; tuple = (struct nf_conntrack_tuple) { .dst = { .protonum = cp->protocol, .dir = IP_CT_DIR_ORIGINAL } }; tuple.src.u3 = cp->caddr; tuple.src.u.all = cp->cport; tuple.src.l3num = cp->af; tuple.dst.u3 = cp->vaddr; tuple.dst.u.all = cp->vport; IP_VS_DBG_BUF(7, "%s: dropping conntrack for conn " FMT_CONN "\n", __func__, ARG_CONN(cp)); h = nf_conntrack_find_get(cp->ipvs->net, &nf_ct_zone_dflt, &tuple); if (h) { ct = nf_ct_tuplehash_to_ctrack(h); if (nf_ct_kill(ct)) { IP_VS_DBG_BUF(7, "%s: ct=%p deleted for tuple=" FMT_TUPLE "\n", __func__, ct, ARG_TUPLE(&tuple)); } else { IP_VS_DBG_BUF(7, "%s: ct=%p, no conntrack for tuple=" FMT_TUPLE "\n", __func__, ct, ARG_TUPLE(&tuple)); } nf_ct_put(ct); } else { IP_VS_DBG_BUF(7, "%s: no conntrack for tuple=" FMT_TUPLE "\n", __func__, ARG_TUPLE(&tuple)); } }
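/*
 * Editor's note: a hedged sketch, not part of the original file, of how a
 * NAT application helper would typically request an expectation for a
 * related data connection using the API above. Passing port 0 asks for a
 * wildcard source port, and from_rs selects a connection expected to
 * originate from the real server (Active FTP DATA style); the names are
 * illustrative only.
 */
static void example_expect_data_conn(struct sk_buff *skb, struct nf_conn *ct,
				     struct ip_vs_conn *data_cp)
{
	/* expect a TCP connection from the real server, any source port */
	ip_vs_nfct_expect_related(skb, ct, data_cp, IPPROTO_TCP, 0, 1);
}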
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __PACKET_INTERNAL_H__ #define __PACKET_INTERNAL_H__ #include <linux/refcount.h> struct packet_mclist { struct packet_mclist *next; int ifindex; int count; unsigned short type; unsigned short alen; unsigned char addr[MAX_ADDR_LEN]; struct list_head remove_list; }; /* kbdq - kernel block descriptor queue */ struct tpacket_kbdq_core { struct pgv *pkbdq; unsigned int feature_req_word; unsigned int hdrlen; unsigned char reset_pending_on_curr_blk; unsigned short kactive_blk_num; unsigned short blk_sizeof_priv; unsigned short version; char *pkblk_start; char *pkblk_end; int kblk_size; unsigned int max_frame_len; unsigned int knum_blocks; uint64_t knxt_seq_num; char *prev; char *nxt_offset; struct sk_buff *skb; rwlock_t blk_fill_in_prog_lock; /* Default is set to 8ms */ #define DEFAULT_PRB_RETIRE_TOV (8) ktime_t interval_ktime; /* timer to retire an outstanding block */ struct hrtimer retire_blk_timer; }; struct pgv { char *buffer; }; struct packet_ring_buffer { struct pgv *pg_vec; unsigned int head; unsigned int frames_per_block; unsigned int frame_size; unsigned int frame_max; unsigned int pg_vec_order; unsigned int pg_vec_pages; unsigned int pg_vec_len; unsigned int __percpu *pending_refcnt; union { unsigned long *rx_owner_map; struct tpacket_kbdq_core prb_bdqc; }; }; extern struct mutex fanout_mutex; #define PACKET_FANOUT_MAX (1 << 16) struct packet_fanout { possible_net_t net; unsigned int num_members; u32 max_num_members; u16 id; u8 type; u8 flags; union { atomic_t rr_cur; struct bpf_prog __rcu *bpf_prog; }; struct list_head list; spinlock_t lock; refcount_t sk_ref; struct packet_type prot_hook ____cacheline_aligned_in_smp; struct sock __rcu *arr[] __counted_by(max_num_members); }; struct packet_rollover { int sock; atomic_long_t num; atomic_long_t num_huge; atomic_long_t num_failed; #define ROLLOVER_HLEN (L1_CACHE_BYTES / sizeof(u32)) u32 history[ROLLOVER_HLEN] ____cacheline_aligned; } ____cacheline_aligned_in_smp; struct packet_sock { /* struct sock has to be the first member of packet_sock */ struct sock sk; struct packet_fanout *fanout; union tpacket_stats_u stats; struct packet_ring_buffer rx_ring; struct packet_ring_buffer tx_ring; int copy_thresh; spinlock_t bind_lock; struct mutex pg_vec_lock; unsigned long flags; int ifindex; /* bound device */ u8 vnet_hdr_sz; __be16 num; struct packet_rollover *rollover; struct packet_mclist *mclist; atomic_long_t mapped; enum tpacket_versions tp_version; unsigned int tp_hdrlen; unsigned int tp_reserve; unsigned int tp_tstamp; struct completion skb_completion; struct net_device __rcu *cached_dev; struct packet_type prot_hook ____cacheline_aligned_in_smp; atomic_t tp_drops ____cacheline_aligned_in_smp; }; #define pkt_sk(ptr) container_of_const(ptr, struct packet_sock, sk) enum packet_sock_flags { PACKET_SOCK_ORIGDEV, PACKET_SOCK_AUXDATA, PACKET_SOCK_TX_HAS_OFF, PACKET_SOCK_TP_LOSS, PACKET_SOCK_RUNNING, PACKET_SOCK_PRESSURE, PACKET_SOCK_QDISC_BYPASS, }; static inline void
packet_sock_flag_set(struct packet_sock *po, enum packet_sock_flags flag, bool val) { if (val) set_bit(flag, &po->flags); else clear_bit(flag, &po->flags); } static inline bool packet_sock_flag(const struct packet_sock *po, enum packet_sock_flags flag) { return test_bit(flag, &po->flags); } #endif
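/*
 * Editor's note: a small illustrative sketch, not part of this header, of
 * how the pkt_sk() accessor and the flag helpers above are meant to be
 * used from af_packet.c-style code; the example_* name is illustrative.
 */
static inline bool example_sock_is_running(const struct sock *sk)
{
	const struct packet_sock *po = pkt_sk(sk);

	return packet_sock_flag(po, PACKET_SOCK_RUNNING);
}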
// SPDX-License-Identifier: GPL-2.0-or-later /* * LAPB release 002 * * This code REQUIRES 2.1.15 or higher/ NET3.038 * * History * LAPB 001 Jonathan Naylor Started Coding */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/in.h> #include <linux/kernel.h> #include <linux/timer.h> #include <linux/string.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/inet.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <net/sock.h> #include <linux/uaccess.h> #include <linux/fcntl.h> #include <linux/mm.h> #include <linux/interrupt.h> #include <net/lapb.h> /* * This routine purges all the queues of frames. */ void lapb_clear_queues(struct lapb_cb *lapb) { skb_queue_purge(&lapb->write_queue); skb_queue_purge(&lapb->ack_queue); } /* * This routine purges the input queue of those frames that have been * acknowledged. This replaces the boxes labelled "V(a) <- N(r)" on the * SDL diagram. */ void lapb_frames_acked(struct lapb_cb *lapb, unsigned short nr) { struct sk_buff *skb; int modulus; modulus = (lapb->mode & LAPB_EXTENDED) ? LAPB_EMODULUS : LAPB_SMODULUS; /* * Remove all the ack-ed frames from the ack queue. */ if (lapb->va != nr) while (skb_peek(&lapb->ack_queue) && lapb->va != nr) { skb = skb_dequeue(&lapb->ack_queue); kfree_skb(skb); lapb->va = (lapb->va + 1) % modulus; } } void lapb_requeue_frames(struct lapb_cb *lapb) { struct sk_buff *skb, *skb_prev = NULL; /* * Requeue all the un-ack-ed frames on the output queue to be picked * up by lapb_kick called from the timer. This arrangement handles the * possibility of an empty output queue. */ while ((skb = skb_dequeue(&lapb->ack_queue)) != NULL) { if (!skb_prev) skb_queue_head(&lapb->write_queue, skb); else skb_append(skb_prev, skb, &lapb->write_queue); skb_prev = skb; } } /* * Validate that the value of nr is between va and vs. Return true or * false for testing. */ int lapb_validate_nr(struct lapb_cb *lapb, unsigned short nr) { unsigned short vc = lapb->va; int modulus; modulus = (lapb->mode & LAPB_EXTENDED) ? LAPB_EMODULUS : LAPB_SMODULUS; while (vc != lapb->vs) { if (nr == vc) return 1; vc = (vc + 1) % modulus; } return nr == lapb->vs; } /* * This routine is the centralised routine for parsing the control * information for the different frame formats.
*/ int lapb_decode(struct lapb_cb *lapb, struct sk_buff *skb, struct lapb_frame *frame) { frame->type = LAPB_ILLEGAL; lapb_dbg(2, "(%p) S%d RX %3ph\n", lapb->dev, lapb->state, skb->data); /* We always need to look at 2 bytes, sometimes we need * to look at 3 and those cases are handled below. */ if (!pskb_may_pull(skb, 2)) return -1; if (lapb->mode & LAPB_MLP) { if (lapb->mode & LAPB_DCE) { if (skb->data[0] == LAPB_ADDR_D) frame->cr = LAPB_COMMAND; if (skb->data[0] == LAPB_ADDR_C) frame->cr = LAPB_RESPONSE; } else { if (skb->data[0] == LAPB_ADDR_C) frame->cr = LAPB_COMMAND; if (skb->data[0] == LAPB_ADDR_D) frame->cr = LAPB_RESPONSE; } } else { if (lapb->mode & LAPB_DCE) { if (skb->data[0] == LAPB_ADDR_B) frame->cr = LAPB_COMMAND; if (skb->data[0] == LAPB_ADDR_A) frame->cr = LAPB_RESPONSE; } else { if (skb->data[0] == LAPB_ADDR_A) frame->cr = LAPB_COMMAND; if (skb->data[0] == LAPB_ADDR_B) frame->cr = LAPB_RESPONSE; } } skb_pull(skb, 1); if (lapb->mode & LAPB_EXTENDED) { if (!(skb->data[0] & LAPB_S)) { if (!pskb_may_pull(skb, 2)) return -1; /* * I frame - carries NR/NS/PF */ frame->type = LAPB_I; frame->ns = (skb->data[0] >> 1) & 0x7F; frame->nr = (skb->data[1] >> 1) & 0x7F; frame->pf = skb->data[1] & LAPB_EPF; frame->control[0] = skb->data[0]; frame->control[1] = skb->data[1]; skb_pull(skb, 2); } else if ((skb->data[0] & LAPB_U) == 1) { if (!pskb_may_pull(skb, 2)) return -1; /* * S frame - take out PF/NR */ frame->type = skb->data[0] & 0x0F; frame->nr = (skb->data[1] >> 1) & 0x7F; frame->pf = skb->data[1] & LAPB_EPF; frame->control[0] = skb->data[0]; frame->control[1] = skb->data[1]; skb_pull(skb, 2); } else if ((skb->data[0] & LAPB_U) == 3) { /* * U frame - take out PF */ frame->type = skb->data[0] & ~LAPB_SPF; frame->pf = skb->data[0] & LAPB_SPF; frame->control[0] = skb->data[0]; frame->control[1] = 0x00; skb_pull(skb, 1); } } else { if (!(skb->data[0] & LAPB_S)) { /* * I frame - carries NR/NS/PF */ frame->type = LAPB_I; frame->ns = (skb->data[0] >> 1) & 0x07; frame->nr = (skb->data[0] >> 5) & 0x07; frame->pf = skb->data[0] & LAPB_SPF; } else if ((skb->data[0] & LAPB_U) == 1) { /* * S frame - take out PF/NR */ frame->type = skb->data[0] & 0x0F; frame->nr = (skb->data[0] >> 5) & 0x07; frame->pf = skb->data[0] & LAPB_SPF; } else if ((skb->data[0] & LAPB_U) == 3) { /* * U frame - take out PF */ frame->type = skb->data[0] & ~LAPB_SPF; frame->pf = skb->data[0] & LAPB_SPF; } frame->control[0] = skb->data[0]; skb_pull(skb, 1); } return 0; } /* * This routine is called when the HDLC layer internally generates a * command or response for the remote machine ( eg. RR, UA etc. ). * Only supervisory or unnumbered frames are processed, FRMRs are handled * by lapb_transmit_frmr below. */ void lapb_send_control(struct lapb_cb *lapb, int frametype, int poll_bit, int type) { struct sk_buff *skb; unsigned char *dptr; if ((skb = alloc_skb(LAPB_HEADER_LEN + 3, GFP_ATOMIC)) == NULL) return; skb_reserve(skb, LAPB_HEADER_LEN + 1); if (lapb->mode & LAPB_EXTENDED) { if ((frametype & LAPB_U) == LAPB_U) { dptr = skb_put(skb, 1); *dptr = frametype; *dptr |= poll_bit ? LAPB_SPF : 0; } else { dptr = skb_put(skb, 2); dptr[0] = frametype; dptr[1] = (lapb->vr << 1); dptr[1] |= poll_bit ? LAPB_EPF : 0; } } else { dptr = skb_put(skb, 1); *dptr = frametype; *dptr |= poll_bit ? LAPB_SPF : 0; if ((frametype & LAPB_U) == LAPB_S) /* S frames carry NR */ *dptr |= (lapb->vr << 5); } lapb_transmit_buffer(lapb, skb, type); } /* * This routine generates FRMRs based on information previously stored in * the LAPB control block. 
*/ void lapb_transmit_frmr(struct lapb_cb *lapb) { struct sk_buff *skb; unsigned char *dptr; if ((skb = alloc_skb(LAPB_HEADER_LEN + 7, GFP_ATOMIC)) == NULL) return; skb_reserve(skb, LAPB_HEADER_LEN + 1); if (lapb->mode & LAPB_EXTENDED) { dptr = skb_put(skb, 6); *dptr++ = LAPB_FRMR; *dptr++ = lapb->frmr_data.control[0]; *dptr++ = lapb->frmr_data.control[1]; *dptr++ = (lapb->vs << 1) & 0xFE; *dptr = (lapb->vr << 1) & 0xFE; if (lapb->frmr_data.cr == LAPB_RESPONSE) *dptr |= 0x01; dptr++; *dptr++ = lapb->frmr_type; lapb_dbg(1, "(%p) S%d TX FRMR %5ph\n", lapb->dev, lapb->state, &skb->data[1]); } else { dptr = skb_put(skb, 4); *dptr++ = LAPB_FRMR; *dptr++ = lapb->frmr_data.control[0]; *dptr = (lapb->vs << 1) & 0x0E; *dptr |= (lapb->vr << 5) & 0xE0; if (lapb->frmr_data.cr == LAPB_RESPONSE) *dptr |= 0x10; dptr++; *dptr++ = lapb->frmr_type; lapb_dbg(1, "(%p) S%d TX FRMR %3ph\n", lapb->dev, lapb->state, &skb->data[1]); } lapb_transmit_buffer(lapb, skb, LAPB_RESPONSE); }
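/*
 * Editor's note: a worked example, not part of the original file, of the
 * modulo-8 control-byte layout handled by the non-extended branch of
 * lapb_decode() above:
 *
 *   I frame : | N(R) bits 7..5 | P/F bit 4 | N(S) bits 3..1 | 0 |
 *   S frame : | N(R) bits 7..5 | P/F bit 4 | type bits 3..2 | 0 1 |
 *   U frame : | type bits 7..5 | P/F bit 4 | type bits 3..2 | 1 1 |
 *
 * For example, control = 0x2A = 0b00101010: bit 0 is clear, so it is an
 * I frame with N(S) = (0x2A >> 1) & 0x07 = 5, N(R) = (0x2A >> 5) & 0x07 = 1
 * and P/F = 0 (bit 4 clear), matching the shifts used in lapb_decode().
 */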
/* SPDX-License-Identifier: GPL-2.0 */ /* * Implementation of the extensible bitmap type. * * Author : Stephen Smalley, <stephen.smalley.work@gmail.com> */ /* * Updated: Hewlett-Packard <paul@paul-moore.com> * Added support to import/export the NetLabel category bitmap * (c) Copyright Hewlett-Packard Development Company, L.P., 2006 * * Updated: KaiGai Kohei <kaigai@ak.jp.nec.com> * Applied standard bit operations to improve bitmap scanning.
*/ #include <linux/kernel.h> #include <linux/slab.h> #include <linux/errno.h> #include <linux/jhash.h> #include <net/netlabel.h> #include "ebitmap.h" #include "policydb.h" #define BITS_PER_U64 ((u32)(sizeof(u64) * 8)) static struct kmem_cache *ebitmap_node_cachep __ro_after_init; bool ebitmap_equal(const struct ebitmap *e1, const struct ebitmap *e2) { const struct ebitmap_node *n1, *n2; if (e1->highbit != e2->highbit) return false; n1 = e1->node; n2 = e2->node; while (n1 && n2 && (n1->startbit == n2->startbit) && !memcmp(n1->maps, n2->maps, EBITMAP_SIZE / 8)) { n1 = n1->next; n2 = n2->next; } if (n1 || n2) return false; return true; } int ebitmap_cpy(struct ebitmap *dst, const struct ebitmap *src) { struct ebitmap_node *new, *prev; const struct ebitmap_node *n; ebitmap_init(dst); n = src->node; prev = NULL; while (n) { new = kmem_cache_zalloc(ebitmap_node_cachep, GFP_ATOMIC); if (!new) { ebitmap_destroy(dst); return -ENOMEM; } new->startbit = n->startbit; memcpy(new->maps, n->maps, EBITMAP_SIZE / 8); new->next = NULL; if (prev) prev->next = new; else dst->node = new; prev = new; n = n->next; } dst->highbit = src->highbit; return 0; } int ebitmap_and(struct ebitmap *dst, const struct ebitmap *e1, const struct ebitmap *e2) { struct ebitmap_node *n; u32 bit; int rc; ebitmap_init(dst); ebitmap_for_each_positive_bit(e1, n, bit) { if (ebitmap_get_bit(e2, bit)) { rc = ebitmap_set_bit(dst, bit, 1); if (rc < 0) return rc; } } return 0; } #ifdef CONFIG_NETLABEL /** * ebitmap_netlbl_export - Export an ebitmap into a NetLabel category bitmap * @ebmap: the ebitmap to export * @catmap: the NetLabel category bitmap * * Description: * Export a SELinux extensible bitmap into a NetLabel category bitmap. * Returns zero on success, negative values on error. * */ int ebitmap_netlbl_export(struct ebitmap *ebmap, struct netlbl_lsm_catmap **catmap) { struct ebitmap_node *e_iter = ebmap->node; unsigned long e_map; u32 offset; unsigned int iter; int rc; if (e_iter == NULL) { *catmap = NULL; return 0; } if (*catmap != NULL) netlbl_catmap_free(*catmap); *catmap = NULL; while (e_iter) { offset = e_iter->startbit; for (iter = 0; iter < EBITMAP_UNIT_NUMS; iter++) { e_map = e_iter->maps[iter]; if (e_map != 0) { rc = netlbl_catmap_setlong(catmap, offset, e_map, GFP_ATOMIC); if (rc != 0) goto netlbl_export_failure; } offset += EBITMAP_UNIT_SIZE; } e_iter = e_iter->next; } return 0; netlbl_export_failure: netlbl_catmap_free(*catmap); return -ENOMEM; } /** * ebitmap_netlbl_import - Import a NetLabel category bitmap into an ebitmap * @ebmap: the ebitmap to import * @catmap: the NetLabel category bitmap * * Description: * Import a NetLabel category bitmap into a SELinux extensible bitmap. * Returns zero on success, negative values on error.
* */ int ebitmap_netlbl_import(struct ebitmap *ebmap, struct netlbl_lsm_catmap *catmap) { int rc; struct ebitmap_node *e_iter = NULL; struct ebitmap_node *e_prev = NULL; u32 offset = 0, idx; unsigned long bitmap; for (;;) { rc = netlbl_catmap_getlong(catmap, &offset, &bitmap); if (rc < 0) goto netlbl_import_failure; if (offset == (u32)-1) return 0; /* don't waste ebitmap space if the netlabel bitmap is empty */ if (bitmap == 0) { offset += EBITMAP_UNIT_SIZE; continue; } if (e_iter == NULL || offset >= e_iter->startbit + EBITMAP_SIZE) { e_prev = e_iter; e_iter = kmem_cache_zalloc(ebitmap_node_cachep, GFP_ATOMIC); if (e_iter == NULL) goto netlbl_import_failure; e_iter->startbit = offset - (offset % EBITMAP_SIZE); if (e_prev == NULL) ebmap->node = e_iter; else e_prev->next = e_iter; ebmap->highbit = e_iter->startbit + EBITMAP_SIZE; } /* offset will always be aligned to an unsigned long */ idx = EBITMAP_NODE_INDEX(e_iter, offset); e_iter->maps[idx] = bitmap; /* next */ offset += EBITMAP_UNIT_SIZE; } /* NOTE: we should never reach this return */ return 0; netlbl_import_failure: ebitmap_destroy(ebmap); return -ENOMEM; } #endif /* CONFIG_NETLABEL */ /* * Check to see if all the bits set in e2 are also set in e1. Optionally, * if last_e2bit is non-zero, the highest set bit in e2 cannot exceed * last_e2bit. */ int ebitmap_contains(const struct ebitmap *e1, const struct ebitmap *e2, u32 last_e2bit) { const struct ebitmap_node *n1, *n2; int i; if (e1->highbit < e2->highbit) return 0; n1 = e1->node; n2 = e2->node; while (n1 && n2 && (n1->startbit <= n2->startbit)) { if (n1->startbit < n2->startbit) { n1 = n1->next; continue; } for (i = EBITMAP_UNIT_NUMS - 1; (i >= 0) && !n2->maps[i];) i--; /* Skip trailing NULL map entries */ if (last_e2bit && (i >= 0)) { u32 lastsetbit = n2->startbit + i * EBITMAP_UNIT_SIZE + __fls(n2->maps[i]); if (lastsetbit > last_e2bit) return 0; } while (i >= 0) { if ((n1->maps[i] & n2->maps[i]) != n2->maps[i]) return 0; i--; } n1 = n1->next; n2 = n2->next; } if (n2) return 0; return 1; } int ebitmap_get_bit(const struct ebitmap *e, u32 bit) { const struct ebitmap_node *n; if (e->highbit < bit) return 0; n = e->node; while (n && (n->startbit <= bit)) { if ((n->startbit + EBITMAP_SIZE) > bit) return ebitmap_node_get_bit(n, bit); n = n->next; } return 0; } int ebitmap_set_bit(struct ebitmap *e, u32 bit, int value) { struct ebitmap_node *n, *prev, *new; prev = NULL; n = e->node; while (n && n->startbit <= bit) { if ((n->startbit + EBITMAP_SIZE) > bit) { if (value) { ebitmap_node_set_bit(n, bit); } else { u32 s; ebitmap_node_clr_bit(n, bit); s = find_first_bit(n->maps, EBITMAP_SIZE); if (s < EBITMAP_SIZE) return 0; /* drop this node from the bitmap */ if (!n->next) { /* * this was the highest map * within the bitmap */ if (prev) e->highbit = prev->startbit + EBITMAP_SIZE; else e->highbit = 0; } if (prev) prev->next = n->next; else e->node = n->next; kmem_cache_free(ebitmap_node_cachep, n); } return 0; } prev = n; n = n->next; } if (!value) return 0; new = kmem_cache_zalloc(ebitmap_node_cachep, GFP_ATOMIC); if (!new) return -ENOMEM; new->startbit = bit - (bit % EBITMAP_SIZE); ebitmap_node_set_bit(new, bit); if (!n) /* this node will be the highest map within the bitmap */ e->highbit = new->startbit + EBITMAP_SIZE; if (prev) { new->next = prev->next; prev->next = new; } else { new->next = e->node; e->node = new; } return 0; } void ebitmap_destroy(struct ebitmap *e) { struct ebitmap_node *n, *temp; if (!e) return; n = e->node; while (n) { temp = n; n = n->next; 
kmem_cache_free(ebitmap_node_cachep, temp); } e->highbit = 0; e->node = NULL; } int ebitmap_read(struct ebitmap *e, struct policy_file *fp) { struct ebitmap_node *n = NULL; u32 mapunit, count, startbit, index, i; __le32 ebitmap_start; u64 map; __le64 mapbits; __le32 buf[3]; int rc; ebitmap_init(e); rc = next_entry(buf, fp, sizeof buf); if (rc < 0) goto out; mapunit = le32_to_cpu(buf[0]); e->highbit = le32_to_cpu(buf[1]); count = le32_to_cpu(buf[2]); if (mapunit != BITS_PER_U64) { pr_err("SELinux: ebitmap: map size %u does not " "match my size %u (high bit was %u)\n", mapunit, BITS_PER_U64, e->highbit); goto bad; } /* round up e->highbit */ e->highbit += EBITMAP_SIZE - 1; e->highbit -= (e->highbit % EBITMAP_SIZE); if (!e->highbit) { e->node = NULL; goto ok; } if (e->highbit && !count) goto bad; for (i = 0; i < count; i++) { rc = next_entry(&ebitmap_start, fp, sizeof(u32)); if (rc < 0) { pr_err("SELinux: ebitmap: truncated map\n"); goto bad; } startbit = le32_to_cpu(ebitmap_start); if (startbit & (mapunit - 1)) { pr_err("SELinux: ebitmap start bit (%u) is " "not a multiple of the map unit size (%u)\n", startbit, mapunit); goto bad; } if (startbit > e->highbit - mapunit) { pr_err("SELinux: ebitmap start bit (%u) is " "beyond the end of the bitmap (%u)\n", startbit, (e->highbit - mapunit)); goto bad; } if (!n || startbit >= n->startbit + EBITMAP_SIZE) { struct ebitmap_node *tmp; tmp = kmem_cache_zalloc(ebitmap_node_cachep, GFP_KERNEL); if (!tmp) { pr_err("SELinux: ebitmap: out of memory\n"); rc = -ENOMEM; goto bad; } /* round down */ tmp->startbit = startbit - (startbit % EBITMAP_SIZE); if (n) n->next = tmp; else e->node = tmp; n = tmp; } else if (startbit <= n->startbit) { pr_err("SELinux: ebitmap: start bit %u" " comes after start bit %u\n", startbit, n->startbit); goto bad; } rc = next_entry(&mapbits, fp, sizeof(u64)); if (rc < 0) { pr_err("SELinux: ebitmap: truncated map\n"); goto bad; } map = le64_to_cpu(mapbits); if (!map) { pr_err("SELinux: ebitmap: empty map\n"); goto bad; } index = (startbit - n->startbit) / EBITMAP_UNIT_SIZE; while (map) { n->maps[index++] = map & (-1UL); map = EBITMAP_SHIFT_UNIT_SIZE(map); } } if (n && n->startbit + EBITMAP_SIZE != e->highbit) { pr_err("SELinux: ebitmap: high bit %u is not equal to the expected value %zu\n", e->highbit, n->startbit + EBITMAP_SIZE); goto bad; } ok: rc = 0; out: return rc; bad: if (!rc) rc = -EINVAL; ebitmap_destroy(e); goto out; } int ebitmap_write(const struct ebitmap *e, struct policy_file *fp) { struct ebitmap_node *n; u32 bit, count, last_bit, last_startbit; __le32 buf[3]; u64 map; int rc; buf[0] = cpu_to_le32(BITS_PER_U64); count = 0; last_bit = 0; last_startbit = U32_MAX; ebitmap_for_each_positive_bit(e, n, bit) { if (last_startbit == U32_MAX || rounddown(bit, BITS_PER_U64) > last_startbit) { count++; last_startbit = rounddown(bit, BITS_PER_U64); } last_bit = roundup(bit + 1, BITS_PER_U64); } buf[1] = cpu_to_le32(last_bit); buf[2] = cpu_to_le32(count); rc = put_entry(buf, sizeof(u32), 3, fp); if (rc) return rc; map = 0; last_startbit = U32_MAX; ebitmap_for_each_positive_bit(e, n, bit) { if (last_startbit == U32_MAX || rounddown(bit, BITS_PER_U64) > last_startbit) { __le64 buf64[1]; /* this is the very first bit */ if (!map) { last_startbit = rounddown(bit, BITS_PER_U64); map = (u64)1 << (bit - last_startbit); continue; } /* write the last node */ buf[0] = cpu_to_le32(last_startbit); rc = put_entry(buf, sizeof(u32), 1, fp); if (rc) return rc; buf64[0] = cpu_to_le64(map); rc = put_entry(buf64, sizeof(u64), 1, fp); if (rc) return 
rc; /* set up for the next node */ map = 0; last_startbit = rounddown(bit, BITS_PER_U64); } map |= (u64)1 << (bit - last_startbit); } /* write the last node */ if (map) { __le64 buf64[1]; /* write the last node */ buf[0] = cpu_to_le32(last_startbit); rc = put_entry(buf, sizeof(u32), 1, fp); if (rc) return rc; buf64[0] = cpu_to_le64(map); rc = put_entry(buf64, sizeof(u64), 1, fp); if (rc) return rc; } return 0; } u32 ebitmap_hash(const struct ebitmap *e, u32 hash) { struct ebitmap_node *node; /* need to change hash even if ebitmap is empty */ hash = jhash_1word(e->highbit, hash); for (node = e->node; node; node = node->next) { hash = jhash_1word(node->startbit, hash); hash = jhash(node->maps, sizeof(node->maps), hash); } return hash; } void __init ebitmap_cache_init(void) { ebitmap_node_cachep = KMEM_CACHE(ebitmap_node, SLAB_PANIC); }
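/*
 * Illustrative sketch (not part of ebitmap.c): one way a caller might combine
 * the primitives above. ebitmap_init() comes from ebitmap.h; the function name
 * example_ebitmap_usage() and the bit values are invented for illustration.
 */
static int example_ebitmap_usage(void)
{
	struct ebitmap a, b, common;
	int rc;

	ebitmap_init(&a);
	ebitmap_init(&b);

	/* Sparse sets: only the nodes covering bits 3 and 300 get allocated. */
	rc = ebitmap_set_bit(&a, 3, 1);
	if (!rc)
		rc = ebitmap_set_bit(&a, 300, 1);
	if (!rc)
		rc = ebitmap_set_bit(&b, 3, 1);
	if (rc)
		goto out;

	/* b is a subset of a; last_e2bit == 0 disables the high-bit limit. */
	WARN_ON(!ebitmap_contains(&a, &b, 0));

	/* Intersection keeps only bit 3. */
	rc = ebitmap_and(&common, &a, &b);
	if (!rc)
		WARN_ON(!ebitmap_get_bit(&common, 3) ||
			ebitmap_get_bit(&common, 300));

	ebitmap_destroy(&common);
out:
	ebitmap_destroy(&a);
	ebitmap_destroy(&b);
	return rc;
}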
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_SEQLOCK_H #define __LINUX_SEQLOCK_H /* * seqcount_t / seqlock_t - a reader-writer consistency mechanism with * lockless readers (read-only retry loops), and no writer starvation. * * See Documentation/locking/seqlock.rst * * Copyrights: * - Based on x86_64 vsyscall gettimeofday: Keith Owens, Andrea Arcangeli * - Sequence counters with associated locks, (C) 2020 Linutronix GmbH */ #include <linux/compiler.h> #include <linux/kcsan-checks.h> #include <linux/lockdep.h> #include <linux/mutex.h> #include <linux/preempt.h> #include <linux/seqlock_types.h> #include <linux/spinlock.h> #include <asm/processor.h> /* * The seqlock seqcount_t interface does not prescribe a precise sequence of * read begin/retry/end. For readers, typically there is a call to * read_seqcount_begin() and read_seqcount_retry(), however, there are more * esoteric cases which do not follow this pattern. * * As a consequence, we take the following best-effort approach for raw usage * via seqcount_t under KCSAN: upon beginning a seq-reader critical section, * pessimistically mark the next KCSAN_SEQLOCK_REGION_MAX memory accesses as * atomics; if there is a matching read_seqcount_retry() call, no following * memory operations are considered atomic. Usage of the seqlock_t interface * is not affected.
*/ #define KCSAN_SEQLOCK_REGION_MAX 1000 static inline void __seqcount_init(seqcount_t *s, const char *name, struct lock_class_key *key) { /* * Make sure we are not reinitializing a held lock: */ lockdep_init_map(&s->dep_map, name, key, 0); s->sequence = 0; } #ifdef CONFIG_DEBUG_LOCK_ALLOC # define SEQCOUNT_DEP_MAP_INIT(lockname) \ .dep_map = { .name = #lockname } /** * seqcount_init() - runtime initializer for seqcount_t * @s: Pointer to the seqcount_t instance */ # define seqcount_init(s) \ do { \ static struct lock_class_key __key; \ __seqcount_init((s), #s, &__key); \ } while (0) static inline void seqcount_lockdep_reader_access(const seqcount_t *s) { seqcount_t *l = (seqcount_t *)s; unsigned long flags; local_irq_save(flags); seqcount_acquire_read(&l->dep_map, 0, 0, _RET_IP_); seqcount_release(&l->dep_map, _RET_IP_); local_irq_restore(flags); } #else # define SEQCOUNT_DEP_MAP_INIT(lockname) # define seqcount_init(s) __seqcount_init(s, NULL, NULL) # define seqcount_lockdep_reader_access(x) #endif /** * SEQCNT_ZERO() - static initializer for seqcount_t * @name: Name of the seqcount_t instance */ #define SEQCNT_ZERO(name) { .sequence = 0, SEQCOUNT_DEP_MAP_INIT(name) } /* * Sequence counters with associated locks (seqcount_LOCKNAME_t) * * A sequence counter which associates the lock used for writer * serialization at initialization time. This enables lockdep to validate * that the write side critical section is properly serialized. * * For associated locks which do not implicitly disable preemption, * preemption protection is enforced in the write side function. * * Lockdep is never used in any for the raw write variants. * * See Documentation/locking/seqlock.rst */ /* * typedef seqcount_LOCKNAME_t - sequence counter with LOCKNAME associated * @seqcount: The real sequence counter * @lock: Pointer to the associated lock * * A plain sequence counter with external writer synchronization by * LOCKNAME @lock. The lock is associated to the sequence counter in the * static initializer or init function. This enables lockdep to validate * that the write side critical section is properly serialized. 
* * LOCKNAME: raw_spinlock, spinlock, rwlock or mutex */ /* * seqcount_LOCKNAME_init() - runtime initializer for seqcount_LOCKNAME_t * @s: Pointer to the seqcount_LOCKNAME_t instance * @lock: Pointer to the associated lock */ #define seqcount_LOCKNAME_init(s, _lock, lockname) \ do { \ seqcount_##lockname##_t *____s = (s); \ seqcount_init(&____s->seqcount); \ __SEQ_LOCK(____s->lock = (_lock)); \ } while (0) #define seqcount_raw_spinlock_init(s, lock) seqcount_LOCKNAME_init(s, lock, raw_spinlock) #define seqcount_spinlock_init(s, lock) seqcount_LOCKNAME_init(s, lock, spinlock) #define seqcount_rwlock_init(s, lock) seqcount_LOCKNAME_init(s, lock, rwlock) #define seqcount_mutex_init(s, lock) seqcount_LOCKNAME_init(s, lock, mutex) /* * SEQCOUNT_LOCKNAME() - Instantiate seqcount_LOCKNAME_t and helpers * seqprop_LOCKNAME_*() - Property accessors for seqcount_LOCKNAME_t * * @lockname: "LOCKNAME" part of seqcount_LOCKNAME_t * @locktype: LOCKNAME canonical C data type * @preemptible: preemptibility of above locktype * @lockbase: prefix for associated lock/unlock */ #define SEQCOUNT_LOCKNAME(lockname, locktype, preemptible, lockbase) \ static __always_inline seqcount_t * \ __seqprop_##lockname##_ptr(seqcount_##lockname##_t *s) \ { \ return &s->seqcount; \ } \ \ static __always_inline const seqcount_t * \ __seqprop_##lockname##_const_ptr(const seqcount_##lockname##_t *s) \ { \ return &s->seqcount; \ } \ \ static __always_inline unsigned \ __seqprop_##lockname##_sequence(const seqcount_##lockname##_t *s) \ { \ unsigned seq = smp_load_acquire(&s->seqcount.sequence); \ \ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) \ return seq; \ \ if (preemptible && unlikely(seq & 1)) { \ __SEQ_LOCK(lockbase##_lock(s->lock)); \ __SEQ_LOCK(lockbase##_unlock(s->lock)); \ \ /* \ * Re-read the sequence counter since the (possibly \ * preempted) writer made progress. 
\ */ \ seq = smp_load_acquire(&s->seqcount.sequence); \ } \ \ return seq; \ } \ \ static __always_inline bool \ __seqprop_##lockname##_preemptible(const seqcount_##lockname##_t *s) \ { \ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) \ return preemptible; \ \ /* PREEMPT_RT relies on the above LOCK+UNLOCK */ \ return false; \ } \ \ static __always_inline void \ __seqprop_##lockname##_assert(const seqcount_##lockname##_t *s) \ { \ __SEQ_LOCK(lockdep_assert_held(s->lock)); \ } /* * __seqprop() for seqcount_t */ static inline seqcount_t *__seqprop_ptr(seqcount_t *s) { return s; } static inline const seqcount_t *__seqprop_const_ptr(const seqcount_t *s) { return s; } static inline unsigned __seqprop_sequence(const seqcount_t *s) { return smp_load_acquire(&s->sequence); } static inline bool __seqprop_preemptible(const seqcount_t *s) { return false; } static inline void __seqprop_assert(const seqcount_t *s) { lockdep_assert_preemption_disabled(); } #define __SEQ_RT IS_ENABLED(CONFIG_PREEMPT_RT) SEQCOUNT_LOCKNAME(raw_spinlock, raw_spinlock_t, false, raw_spin) SEQCOUNT_LOCKNAME(spinlock, spinlock_t, __SEQ_RT, spin) SEQCOUNT_LOCKNAME(rwlock, rwlock_t, __SEQ_RT, read) SEQCOUNT_LOCKNAME(mutex, struct mutex, true, mutex) #undef SEQCOUNT_LOCKNAME /* * SEQCNT_LOCKNAME_ZERO - static initializer for seqcount_LOCKNAME_t * @name: Name of the seqcount_LOCKNAME_t instance * @lock: Pointer to the associated LOCKNAME */ #define SEQCOUNT_LOCKNAME_ZERO(seq_name, assoc_lock) { \ .seqcount = SEQCNT_ZERO(seq_name.seqcount), \ __SEQ_LOCK(.lock = (assoc_lock)) \ } #define SEQCNT_RAW_SPINLOCK_ZERO(name, lock) SEQCOUNT_LOCKNAME_ZERO(name, lock) #define SEQCNT_SPINLOCK_ZERO(name, lock) SEQCOUNT_LOCKNAME_ZERO(name, lock) #define SEQCNT_RWLOCK_ZERO(name, lock) SEQCOUNT_LOCKNAME_ZERO(name, lock) #define SEQCNT_MUTEX_ZERO(name, lock) SEQCOUNT_LOCKNAME_ZERO(name, lock) #define SEQCNT_WW_MUTEX_ZERO(name, lock) SEQCOUNT_LOCKNAME_ZERO(name, lock) #define __seqprop_case(s, lockname, prop) \ seqcount_##lockname##_t: __seqprop_##lockname##_##prop #define __seqprop(s, prop) _Generic(*(s), \ seqcount_t: __seqprop_##prop, \ __seqprop_case((s), raw_spinlock, prop), \ __seqprop_case((s), spinlock, prop), \ __seqprop_case((s), rwlock, prop), \ __seqprop_case((s), mutex, prop)) #define seqprop_ptr(s) __seqprop(s, ptr)(s) #define seqprop_const_ptr(s) __seqprop(s, const_ptr)(s) #define seqprop_sequence(s) __seqprop(s, sequence)(s) #define seqprop_preemptible(s) __seqprop(s, preemptible)(s) #define seqprop_assert(s) __seqprop(s, assert)(s) /** * __read_seqcount_begin() - begin a seqcount_t read section * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants * * Return: count to be passed to read_seqcount_retry() */ #define __read_seqcount_begin(s) \ ({ \ unsigned __seq; \ \ while (unlikely((__seq = seqprop_sequence(s)) & 1)) \ cpu_relax(); \ \ kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX); \ __seq; \ }) /** * raw_read_seqcount_begin() - begin a seqcount_t read section w/o lockdep * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants * * Return: count to be passed to read_seqcount_retry() */ #define raw_read_seqcount_begin(s) __read_seqcount_begin(s) /** * read_seqcount_begin() - begin a seqcount_t read critical section * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants * * Return: count to be passed to read_seqcount_retry() */ #define read_seqcount_begin(s) \ ({ \ seqcount_lockdep_reader_access(seqprop_const_ptr(s)); \ raw_read_seqcount_begin(s); \ }) /** * raw_read_seqcount() - read the raw seqcount_t 
counter value * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants * * raw_read_seqcount opens a read critical section of the given * seqcount_t, without any lockdep checking, and without checking or * masking the sequence counter LSB. Calling code is responsible for * handling that. * * Return: count to be passed to read_seqcount_retry() */ #define raw_read_seqcount(s) \ ({ \ unsigned __seq = seqprop_sequence(s); \ \ kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX); \ __seq; \ }) /** * raw_seqcount_try_begin() - begin a seqcount_t read critical section * w/o lockdep and w/o counter stabilization * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants * @start: count to be passed to read_seqcount_retry() * * Similar to raw_seqcount_begin(), except it enables eliding the critical * section entirely if odd, instead of doing the speculation knowing it will * fail. * * Useful when counter stabilization is more or less equivalent to taking * the lock and there is a slowpath that does that. * * If true, start will be set to the (even) sequence count read. * * Return: true when a read critical section is started. */ #define raw_seqcount_try_begin(s, start) \ ({ \ start = raw_read_seqcount(s); \ !(start & 1); \ }) /** * raw_seqcount_begin() - begin a seqcount_t read critical section w/o * lockdep and w/o counter stabilization * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants * * raw_seqcount_begin opens a read critical section of the given * seqcount_t. Unlike read_seqcount_begin(), this function will not wait * for the count to stabilize. If a writer is active when it begins, it * will fail the read_seqcount_retry() at the end of the read critical * section instead of stabilizing at the beginning of it. * * Use this only in special kernel hot paths where the read section is * small and has a high probability of success through other external * means. It will save a single branching instruction. * * Return: count to be passed to read_seqcount_retry() */ #define raw_seqcount_begin(s) \ ({ \ /* \ * If the counter is odd, let read_seqcount_retry() fail \ * by decrementing the counter. \ */ \ raw_read_seqcount(s) & ~1; \ }) /** * __read_seqcount_retry() - end a seqcount_t read section w/o barrier * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants * @start: count, from read_seqcount_begin() * * __read_seqcount_retry is like read_seqcount_retry, but has no smp_rmb() * barrier. Callers should ensure that smp_rmb() or equivalent ordering is * provided before actually loading any of the variables that are to be * protected in this critical section. * * Use carefully, only in critical code, and comment how the barrier is * provided. * * Return: true if a read section retry is required, else false */ #define __read_seqcount_retry(s, start) \ do___read_seqcount_retry(seqprop_const_ptr(s), start) static inline int do___read_seqcount_retry(const seqcount_t *s, unsigned start) { kcsan_atomic_next(0); return unlikely(READ_ONCE(s->sequence) != start); } /** * read_seqcount_retry() - end a seqcount_t read critical section * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants * @start: count, from read_seqcount_begin() * * read_seqcount_retry closes the read critical section of given * seqcount_t. If the critical section was invalid, it must be ignored * (and typically retried). 
* * Return: true if a read section retry is required, else false */ #define read_seqcount_retry(s, start) \ do_read_seqcount_retry(seqprop_const_ptr(s), start) static inline int do_read_seqcount_retry(const seqcount_t *s, unsigned start) { smp_rmb(); return do___read_seqcount_retry(s, start); } /** * raw_write_seqcount_begin() - start a seqcount_t write section w/o lockdep * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants * * Context: check write_seqcount_begin() */ #define raw_write_seqcount_begin(s) \ do { \ if (seqprop_preemptible(s)) \ preempt_disable(); \ \ do_raw_write_seqcount_begin(seqprop_ptr(s)); \ } while (0) static inline void do_raw_write_seqcount_begin(seqcount_t *s) { kcsan_nestable_atomic_begin(); s->sequence++; smp_wmb(); } /** * raw_write_seqcount_end() - end a seqcount_t write section w/o lockdep * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants * * Context: check write_seqcount_end() */ #define raw_write_seqcount_end(s) \ do { \ do_raw_write_seqcount_end(seqprop_ptr(s)); \ \ if (seqprop_preemptible(s)) \ preempt_enable(); \ } while (0) static inline void do_raw_write_seqcount_end(seqcount_t *s) { smp_wmb(); s->sequence++; kcsan_nestable_atomic_end(); } /** * write_seqcount_begin_nested() - start a seqcount_t write section with * custom lockdep nesting level * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants * @subclass: lockdep nesting level * * See Documentation/locking/lockdep-design.rst * Context: check write_seqcount_begin() */ #define write_seqcount_begin_nested(s, subclass) \ do { \ seqprop_assert(s); \ \ if (seqprop_preemptible(s)) \ preempt_disable(); \ \ do_write_seqcount_begin_nested(seqprop_ptr(s), subclass); \ } while (0) static inline void do_write_seqcount_begin_nested(seqcount_t *s, int subclass) { seqcount_acquire(&s->dep_map, subclass, 0, _RET_IP_); do_raw_write_seqcount_begin(s); } /** * write_seqcount_begin() - start a seqcount_t write side critical section * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants * * Context: sequence counter write side sections must be serialized and * non-preemptible. Preemption will be automatically disabled if and * only if the seqcount write serialization lock is associated, and * preemptible. If readers can be invoked from hardirq or softirq * context, interrupts or bottom halves must be respectively disabled. */ #define write_seqcount_begin(s) \ do { \ seqprop_assert(s); \ \ if (seqprop_preemptible(s)) \ preempt_disable(); \ \ do_write_seqcount_begin(seqprop_ptr(s)); \ } while (0) static inline void do_write_seqcount_begin(seqcount_t *s) { do_write_seqcount_begin_nested(s, 0); } /** * write_seqcount_end() - end a seqcount_t write side critical section * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants * * Context: Preemption will be automatically re-enabled if and only if * the seqcount write serialization lock is associated, and preemptible. */ #define write_seqcount_end(s) \ do { \ do_write_seqcount_end(seqprop_ptr(s)); \ \ if (seqprop_preemptible(s)) \ preempt_enable(); \ } while (0) static inline void do_write_seqcount_end(seqcount_t *s) { seqcount_release(&s->dep_map, _RET_IP_); do_raw_write_seqcount_end(s); } /** * raw_write_seqcount_barrier() - do a seqcount_t write barrier * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants * * This can be used to provide an ordering guarantee instead of the usual * consistency guarantee. 
It is one wmb cheaper, because it can collapse * the two back-to-back wmb()s. * * Note that writes surrounding the barrier should be declared atomic (e.g. * via WRITE_ONCE): a) to ensure the writes become visible to other threads * atomically, avoiding compiler optimizations; b) to document which writes are * meant to propagate to the reader critical section. This is necessary because * neither writes before nor after the barrier are enclosed in a seq-writer * critical section that would ensure readers are aware of ongoing writes:: * * seqcount_t seq; * bool X = true, Y = false; * * void read(void) * { * bool x, y; * * do { * int s = read_seqcount_begin(&seq); * * x = X; y = Y; * * } while (read_seqcount_retry(&seq, s)); * * BUG_ON(!x && !y); * } * * void write(void) * { * WRITE_ONCE(Y, true); * * raw_write_seqcount_barrier(seq); * * WRITE_ONCE(X, false); * } */ #define raw_write_seqcount_barrier(s) \ do_raw_write_seqcount_barrier(seqprop_ptr(s)) static inline void do_raw_write_seqcount_barrier(seqcount_t *s) { kcsan_nestable_atomic_begin(); s->sequence++; smp_wmb(); s->sequence++; kcsan_nestable_atomic_end(); } /** * write_seqcount_invalidate() - invalidate in-progress seqcount_t read * side operations * @s: Pointer to seqcount_t or any of the seqcount_LOCKNAME_t variants * * After write_seqcount_invalidate, no seqcount_t read side operations * will complete successfully and see data older than this. */ #define write_seqcount_invalidate(s) \ do_write_seqcount_invalidate(seqprop_ptr(s)) static inline void do_write_seqcount_invalidate(seqcount_t *s) { smp_wmb(); kcsan_nestable_atomic_begin(); s->sequence+=2; kcsan_nestable_atomic_end(); } /* * Latch sequence counters (seqcount_latch_t) * * A sequence counter variant where the counter even/odd value is used to * switch between two copies of protected data. This allows the read path, * typically NMIs, to safely interrupt the write side critical section. * * As the write sections are fully preemptible, no special handling for * PREEMPT_RT is needed. */ typedef struct { seqcount_t seqcount; } seqcount_latch_t; /** * SEQCNT_LATCH_ZERO() - static initializer for seqcount_latch_t * @seq_name: Name of the seqcount_latch_t instance */ #define SEQCNT_LATCH_ZERO(seq_name) { \ .seqcount = SEQCNT_ZERO(seq_name.seqcount), \ } /** * seqcount_latch_init() - runtime initializer for seqcount_latch_t * @s: Pointer to the seqcount_latch_t instance */ #define seqcount_latch_init(s) seqcount_init(&(s)->seqcount) /** * raw_read_seqcount_latch() - pick even/odd latch data copy * @s: Pointer to seqcount_latch_t * * See raw_write_seqcount_latch() for details and a full reader/writer * usage example. * * Return: sequence counter raw value. Use the lowest bit as an index for * picking which data copy to read. The full counter must then be checked * with raw_read_seqcount_latch_retry(). */ static __always_inline unsigned raw_read_seqcount_latch(const seqcount_latch_t *s) { /* * Pairs with the first smp_wmb() in raw_write_seqcount_latch(). * Due to the dependent load, a full smp_rmb() is not needed. */ return READ_ONCE(s->seqcount.sequence); } /** * read_seqcount_latch() - pick even/odd latch data copy * @s: Pointer to seqcount_latch_t * * See write_seqcount_latch() for details and a full reader/writer usage * example. * * Return: sequence counter raw value. Use the lowest bit as an index for * picking which data copy to read. The full counter must then be checked * with read_seqcount_latch_retry(). 
*/ static __always_inline unsigned read_seqcount_latch(const seqcount_latch_t *s) { kcsan_atomic_next(KCSAN_SEQLOCK_REGION_MAX); return raw_read_seqcount_latch(s); } /** * raw_read_seqcount_latch_retry() - end a seqcount_latch_t read section * @s: Pointer to seqcount_latch_t * @start: count, from raw_read_seqcount_latch() * * Return: true if a read section retry is required, else false */ static __always_inline int raw_read_seqcount_latch_retry(const seqcount_latch_t *s, unsigned start) { smp_rmb(); return unlikely(READ_ONCE(s->seqcount.sequence) != start); } /** * read_seqcount_latch_retry() - end a seqcount_latch_t read section * @s: Pointer to seqcount_latch_t * @start: count, from read_seqcount_latch() * * Return: true if a read section retry is required, else false */ static __always_inline int read_seqcount_latch_retry(const seqcount_latch_t *s, unsigned start) { kcsan_atomic_next(0); return raw_read_seqcount_latch_retry(s, start); } /** * raw_write_seqcount_latch() - redirect latch readers to even/odd copy * @s: Pointer to seqcount_latch_t */ static __always_inline void raw_write_seqcount_latch(seqcount_latch_t *s) { smp_wmb(); /* prior stores before incrementing "sequence" */ s->seqcount.sequence++; smp_wmb(); /* increment "sequence" before following stores */ } /** * write_seqcount_latch_begin() - redirect latch readers to odd copy * @s: Pointer to seqcount_latch_t * * The latch technique is a multiversion concurrency control method that allows * queries during non-atomic modifications. If you can guarantee queries never * interrupt the modification -- e.g. the concurrency is strictly between CPUs * -- you most likely do not need this. * * Where the traditional RCU/lockless data structures rely on atomic * modifications to ensure queries observe either the old or the new state the * latch allows the same for non-atomic updates. The trade-off is doubling the * cost of storage; we have to maintain two copies of the entire data * structure. * * Very simply put: we first modify one copy and then the other. This ensures * there is always one copy in a stable state, ready to give us an answer. * * The basic form is a data structure like:: * * struct latch_struct { * seqcount_latch_t seq; * struct data_struct data[2]; * }; * * Where a modification, which is assumed to be externally serialized, does the * following:: * * void latch_modify(struct latch_struct *latch, ...) * { * write_seqcount_latch_begin(&latch->seq); * modify(latch->data[0], ...); * write_seqcount_latch(&latch->seq); * modify(latch->data[1], ...); * write_seqcount_latch_end(&latch->seq); * } * * The query will have a form like:: * * struct entry *latch_query(struct latch_struct *latch, ...) * { * struct entry *entry; * unsigned seq, idx; * * do { * seq = read_seqcount_latch(&latch->seq); * * idx = seq & 0x01; * entry = data_query(latch->data[idx], ...); * * // This includes needed smp_rmb() * } while (read_seqcount_latch_retry(&latch->seq, seq)); * * return entry; * } * * So during the modification, queries are first redirected to data[1]. Then we * modify data[0]. When that is complete, we redirect queries back to data[0] * and we can modify data[1]. * * NOTE: * * The non-requirement for atomic modifications does _NOT_ include * the publishing of new entries in the case where data is a dynamic * data structure. * * An iteration might start in data[0] and get suspended long enough * to miss an entire modification sequence, once it resumes it might * observe the new entry. 
* * NOTE2: * * When data is a dynamic data structure; one should use regular RCU * patterns to manage the lifetimes of the objects within. */ static __always_inline void write_seqcount_latch_begin(seqcount_latch_t *s) { kcsan_nestable_atomic_begin(); raw_write_seqcount_latch(s); } /** * write_seqcount_latch() - redirect latch readers to even copy * @s: Pointer to seqcount_latch_t */ static __always_inline void write_seqcount_latch(seqcount_latch_t *s) { raw_write_seqcount_latch(s); } /** * write_seqcount_latch_end() - end a seqcount_latch_t write section * @s: Pointer to seqcount_latch_t * * Marks the end of a seqcount_latch_t writer section, after all copies of the * latch-protected data have been updated. */ static __always_inline void write_seqcount_latch_end(seqcount_latch_t *s) { kcsan_nestable_atomic_end(); } #define __SEQLOCK_UNLOCKED(lockname) \ { \ .seqcount = SEQCNT_SPINLOCK_ZERO(lockname, &(lockname).lock), \ .lock = __SPIN_LOCK_UNLOCKED(lockname) \ } /** * seqlock_init() - dynamic initializer for seqlock_t * @sl: Pointer to the seqlock_t instance */ #define seqlock_init(sl) \ do { \ spin_lock_init(&(sl)->lock); \ seqcount_spinlock_init(&(sl)->seqcount, &(sl)->lock); \ } while (0) /** * DEFINE_SEQLOCK(sl) - Define a statically allocated seqlock_t * @sl: Name of the seqlock_t instance */ #define DEFINE_SEQLOCK(sl) \ seqlock_t sl = __SEQLOCK_UNLOCKED(sl) /** * read_seqbegin() - start a seqlock_t read side critical section * @sl: Pointer to seqlock_t * * Return: count, to be passed to read_seqretry() */ static inline unsigned read_seqbegin(const seqlock_t *sl) { return read_seqcount_begin(&sl->seqcount); } /** * read_seqretry() - end a seqlock_t read side section * @sl: Pointer to seqlock_t * @start: count, from read_seqbegin() * * read_seqretry closes the read side critical section of given seqlock_t. * If the critical section was invalid, it must be ignored (and typically * retried). * * Return: true if a read section retry is required, else false */ static inline unsigned read_seqretry(const seqlock_t *sl, unsigned start) { return read_seqcount_retry(&sl->seqcount, start); } /* * For all seqlock_t write side functions, use the internal * do_write_seqcount_begin() instead of generic write_seqcount_begin(). * This way, no redundant lockdep_assert_held() checks are added. */ /** * write_seqlock() - start a seqlock_t write side critical section * @sl: Pointer to seqlock_t * * write_seqlock opens a write side critical section for the given * seqlock_t. It also implicitly acquires the spinlock_t embedded inside * that sequential lock. All seqlock_t write side sections are thus * automatically serialized and non-preemptible. * * Context: if the seqlock_t read section, or other write side critical * sections, can be invoked from hardirq or softirq contexts, use the * _irqsave or _bh variants of this function instead. */ static inline void write_seqlock(seqlock_t *sl) { spin_lock(&sl->lock); do_write_seqcount_begin(&sl->seqcount.seqcount); } /** * write_sequnlock() - end a seqlock_t write side critical section * @sl: Pointer to seqlock_t * * write_sequnlock closes the (serialized and non-preemptible) write side * critical section of given seqlock_t. */ static inline void write_sequnlock(seqlock_t *sl) { do_write_seqcount_end(&sl->seqcount.seqcount); spin_unlock(&sl->lock); } /** * write_seqlock_bh() - start a softirqs-disabled seqlock_t write section * @sl: Pointer to seqlock_t * * _bh variant of write_seqlock(). 
Use only if the read side section, or * other write side sections, can be invoked from softirq contexts. */ static inline void write_seqlock_bh(seqlock_t *sl) { spin_lock_bh(&sl->lock); do_write_seqcount_begin(&sl->seqcount.seqcount); } /** * write_sequnlock_bh() - end a softirqs-disabled seqlock_t write section * @sl: Pointer to seqlock_t * * write_sequnlock_bh closes the serialized, non-preemptible, and * softirqs-disabled, seqlock_t write side critical section opened with * write_seqlock_bh(). */ static inline void write_sequnlock_bh(seqlock_t *sl) { do_write_seqcount_end(&sl->seqcount.seqcount); spin_unlock_bh(&sl->lock); } /** * write_seqlock_irq() - start a non-interruptible seqlock_t write section * @sl: Pointer to seqlock_t * * _irq variant of write_seqlock(). Use only if the read side section, or * other write sections, can be invoked from hardirq contexts. */ static inline void write_seqlock_irq(seqlock_t *sl) { spin_lock_irq(&sl->lock); do_write_seqcount_begin(&sl->seqcount.seqcount); } /** * write_sequnlock_irq() - end a non-interruptible seqlock_t write section * @sl: Pointer to seqlock_t * * write_sequnlock_irq closes the serialized and non-interruptible * seqlock_t write side section opened with write_seqlock_irq(). */ static inline void write_sequnlock_irq(seqlock_t *sl) { do_write_seqcount_end(&sl->seqcount.seqcount); spin_unlock_irq(&sl->lock); } static inline unsigned long __write_seqlock_irqsave(seqlock_t *sl) { unsigned long flags; spin_lock_irqsave(&sl->lock, flags); do_write_seqcount_begin(&sl->seqcount.seqcount); return flags; } /** * write_seqlock_irqsave() - start a non-interruptible seqlock_t write * section * @lock: Pointer to seqlock_t * @flags: Stack-allocated storage for saving caller's local interrupt * state, to be passed to write_sequnlock_irqrestore(). * * _irqsave variant of write_seqlock(). Use it only if the read side * section, or other write sections, can be invoked from hardirq context. */ #define write_seqlock_irqsave(lock, flags) \ do { flags = __write_seqlock_irqsave(lock); } while (0) /** * write_sequnlock_irqrestore() - end non-interruptible seqlock_t write * section * @sl: Pointer to seqlock_t * @flags: Caller's saved interrupt state, from write_seqlock_irqsave() * * write_sequnlock_irqrestore closes the serialized and non-interruptible * seqlock_t write section previously opened with write_seqlock_irqsave(). */ static inline void write_sequnlock_irqrestore(seqlock_t *sl, unsigned long flags) { do_write_seqcount_end(&sl->seqcount.seqcount); spin_unlock_irqrestore(&sl->lock, flags); } /** * read_seqlock_excl() - begin a seqlock_t locking reader section * @sl: Pointer to seqlock_t * * read_seqlock_excl opens a seqlock_t locking reader critical section. A * locking reader exclusively locks out *both* other writers *and* other * locking readers, but it does not update the embedded sequence number. * * Locking readers act like a normal spin_lock()/spin_unlock(). * * Context: if the seqlock_t write section, *or other read sections*, can * be invoked from hardirq or softirq contexts, use the _irqsave or _bh * variant of this function instead. * * The opened read section must be closed with read_sequnlock_excl(). 
*/ static inline void read_seqlock_excl(seqlock_t *sl) { spin_lock(&sl->lock); } /** * read_sequnlock_excl() - end a seqlock_t locking reader critical section * @sl: Pointer to seqlock_t */ static inline void read_sequnlock_excl(seqlock_t *sl) { spin_unlock(&sl->lock); } /** * read_seqlock_excl_bh() - start a seqlock_t locking reader section with * softirqs disabled * @sl: Pointer to seqlock_t * * _bh variant of read_seqlock_excl(). Use this variant only if the * seqlock_t write side section, *or other read sections*, can be invoked * from softirq contexts. */ static inline void read_seqlock_excl_bh(seqlock_t *sl) { spin_lock_bh(&sl->lock); } /** * read_sequnlock_excl_bh() - stop a seqlock_t softirq-disabled locking * reader section * @sl: Pointer to seqlock_t */ static inline void read_sequnlock_excl_bh(seqlock_t *sl) { spin_unlock_bh(&sl->lock); } /** * read_seqlock_excl_irq() - start a non-interruptible seqlock_t locking * reader section * @sl: Pointer to seqlock_t * * _irq variant of read_seqlock_excl(). Use this only if the seqlock_t * write side section, *or other read sections*, can be invoked from a * hardirq context. */ static inline void read_seqlock_excl_irq(seqlock_t *sl) { spin_lock_irq(&sl->lock); } /** * read_sequnlock_excl_irq() - end an interrupts-disabled seqlock_t * locking reader section * @sl: Pointer to seqlock_t */ static inline void read_sequnlock_excl_irq(seqlock_t *sl) { spin_unlock_irq(&sl->lock); } static inline unsigned long __read_seqlock_excl_irqsave(seqlock_t *sl) { unsigned long flags; spin_lock_irqsave(&sl->lock, flags); return flags; } /** * read_seqlock_excl_irqsave() - start a non-interruptible seqlock_t * locking reader section * @lock: Pointer to seqlock_t * @flags: Stack-allocated storage for saving caller's local interrupt * state, to be passed to read_sequnlock_excl_irqrestore(). * * _irqsave variant of read_seqlock_excl(). Use this only if the seqlock_t * write side section, *or other read sections*, can be invoked from a * hardirq context. */ #define read_seqlock_excl_irqsave(lock, flags) \ do { flags = __read_seqlock_excl_irqsave(lock); } while (0) /** * read_sequnlock_excl_irqrestore() - end non-interruptible seqlock_t * locking reader section * @sl: Pointer to seqlock_t * @flags: Caller saved interrupt state, from read_seqlock_excl_irqsave() */ static inline void read_sequnlock_excl_irqrestore(seqlock_t *sl, unsigned long flags) { spin_unlock_irqrestore(&sl->lock, flags); } /** * read_seqbegin_or_lock() - begin a seqlock_t lockless or locking reader * @lock: Pointer to seqlock_t * @seq : Marker and return parameter. If the passed value is even, the * reader will become a *lockless* seqlock_t reader as in read_seqbegin(). * If the passed value is odd, the reader will become a *locking* reader * as in read_seqlock_excl(). In the first call to this function, the * caller *must* initialize and pass an even value to @seq; this way, a * lockless read can be optimistically tried first. * * read_seqbegin_or_lock is an API designed to optimistically try a normal * lockless seqlock_t read section first. If an odd counter is found, the * lockless read trial has failed, and the next read iteration transforms * itself into a full seqlock_t locking reader. * * This is typically used to avoid seqlock_t lockless readers starvation * (too much retry loops) in the case of a sharp spike in write side * activity. 
* * Context: if the seqlock_t write section, *or other read sections*, can * be invoked from hardirq or softirq contexts, use the _irqsave or _bh * variant of this function instead. * * Check Documentation/locking/seqlock.rst for template example code. * * Return: the encountered sequence counter value, through the @seq * parameter, which is overloaded as a return parameter. This returned * value must be checked with need_seqretry(). If the read section need to * be retried, this returned value must also be passed as the @seq * parameter of the next read_seqbegin_or_lock() iteration. */ static inline void read_seqbegin_or_lock(seqlock_t *lock, int *seq) { if (!(*seq & 1)) /* Even */ *seq = read_seqbegin(lock); else /* Odd */ read_seqlock_excl(lock); } /** * need_seqretry() - validate seqlock_t "locking or lockless" read section * @lock: Pointer to seqlock_t * @seq: sequence count, from read_seqbegin_or_lock() * * Return: true if a read section retry is required, false otherwise */ static inline int need_seqretry(seqlock_t *lock, int seq) { return !(seq & 1) && read_seqretry(lock, seq); } /** * done_seqretry() - end seqlock_t "locking or lockless" reader section * @lock: Pointer to seqlock_t * @seq: count, from read_seqbegin_or_lock() * * done_seqretry finishes the seqlock_t read side critical section started * with read_seqbegin_or_lock() and validated by need_seqretry(). */ static inline void done_seqretry(seqlock_t *lock, int seq) { if (seq & 1) read_sequnlock_excl(lock); } /** * read_seqbegin_or_lock_irqsave() - begin a seqlock_t lockless reader, or * a non-interruptible locking reader * @lock: Pointer to seqlock_t * @seq: Marker and return parameter. Check read_seqbegin_or_lock(). * * This is the _irqsave variant of read_seqbegin_or_lock(). Use it only if * the seqlock_t write section, *or other read sections*, can be invoked * from hardirq context. * * Note: Interrupts will be disabled only for "locking reader" mode. * * Return: * * 1. The saved local interrupts state in case of a locking reader, to * be passed to done_seqretry_irqrestore(). * * 2. The encountered sequence counter value, returned through @seq * overloaded as a return parameter. Check read_seqbegin_or_lock(). */ static inline unsigned long read_seqbegin_or_lock_irqsave(seqlock_t *lock, int *seq) { unsigned long flags = 0; if (!(*seq & 1)) /* Even */ *seq = read_seqbegin(lock); else /* Odd */ read_seqlock_excl_irqsave(lock, flags); return flags; } /** * done_seqretry_irqrestore() - end a seqlock_t lockless reader, or a * non-interruptible locking reader section * @lock: Pointer to seqlock_t * @seq: Count, from read_seqbegin_or_lock_irqsave() * @flags: Caller's saved local interrupt state in case of a locking * reader, also from read_seqbegin_or_lock_irqsave() * * This is the _irqrestore variant of done_seqretry(). The read section * must've been opened with read_seqbegin_or_lock_irqsave(), and validated * by need_seqretry(). */ static inline void done_seqretry_irqrestore(seqlock_t *lock, int seq, unsigned long flags) { if (seq & 1) read_sequnlock_excl_irqrestore(lock, flags); } #endif /* __LINUX_SEQLOCK_H */
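/*
 * Illustrative sketch (not part of this header): the canonical lockless
 * reader / serialized writer pattern built from read_seqbegin(),
 * read_seqretry(), write_seqlock() and write_sequnlock(). The structure and
 * helper names are invented for illustration; the embedded seqlock_t is
 * assumed to have been set up with seqlock_init() or DEFINE_SEQLOCK().
 */
struct example_clock {
	seqlock_t lock;
	u64 sec;
	u64 nsec;
};

static void example_clock_update(struct example_clock *c, u64 sec, u64 nsec)
{
	/* Writer: take the embedded spinlock and bump the sequence count. */
	write_seqlock(&c->lock);
	c->sec = sec;
	c->nsec = nsec;
	write_sequnlock(&c->lock);
}

static void example_clock_read(struct example_clock *c, u64 *sec, u64 *nsec)
{
	unsigned int seq;

	/* Reader: lockless snapshot, retried if a writer was active. */
	do {
		seq = read_seqbegin(&c->lock);
		*sec = c->sec;
		*nsec = c->nsec;
	} while (read_seqretry(&c->lock, seq));
}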
/* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Definitions for the 'struct ptr_ring' datastructure. * * Author: * Michael S. Tsirkin <mst@redhat.com> * * Copyright (C) 2016 Red Hat, Inc. * * This is a limited-size FIFO maintaining pointers in FIFO order, with * one CPU producing entries and another consuming entries from a FIFO. * * This implementation tries to minimize cache-contention when there is a * single producer and a single consumer CPU.
*/ #ifndef _LINUX_PTR_RING_H #define _LINUX_PTR_RING_H 1 #ifdef __KERNEL__ #include <linux/spinlock.h> #include <linux/cache.h> #include <linux/types.h> #include <linux/compiler.h> #include <linux/slab.h> #include <linux/mm.h> #include <asm/errno.h> #endif struct ptr_ring { int producer ____cacheline_aligned_in_smp; spinlock_t producer_lock; int consumer_head ____cacheline_aligned_in_smp; /* next valid entry */ int consumer_tail; /* next entry to invalidate */ spinlock_t consumer_lock; /* Shared consumer/producer data */ /* Read-only by both the producer and the consumer */ int size ____cacheline_aligned_in_smp; /* max entries in queue */ int batch; /* number of entries to consume in a batch */ void **queue; }; /* Note: callers invoking this in a loop must use a compiler barrier, * for example cpu_relax(). * * NB: this is unlike __ptr_ring_empty in that callers must hold producer_lock: * see e.g. ptr_ring_full. */ static inline bool __ptr_ring_full(struct ptr_ring *r) { return r->queue[r->producer]; } static inline bool ptr_ring_full(struct ptr_ring *r) { bool ret; spin_lock(&r->producer_lock); ret = __ptr_ring_full(r); spin_unlock(&r->producer_lock); return ret; } static inline bool ptr_ring_full_irq(struct ptr_ring *r) { bool ret; spin_lock_irq(&r->producer_lock); ret = __ptr_ring_full(r); spin_unlock_irq(&r->producer_lock); return ret; } static inline bool ptr_ring_full_any(struct ptr_ring *r) { unsigned long flags; bool ret; spin_lock_irqsave(&r->producer_lock, flags); ret = __ptr_ring_full(r); spin_unlock_irqrestore(&r->producer_lock, flags); return ret; } static inline bool ptr_ring_full_bh(struct ptr_ring *r) { bool ret; spin_lock_bh(&r->producer_lock); ret = __ptr_ring_full(r); spin_unlock_bh(&r->producer_lock); return ret; } /* Note: callers invoking this in a loop must use a compiler barrier, * for example cpu_relax(). Callers must hold producer_lock. * Callers are responsible for making sure pointer that is being queued * points to a valid data. */ static inline int __ptr_ring_produce(struct ptr_ring *r, void *ptr) { if (unlikely(!r->size) || r->queue[r->producer]) return -ENOSPC; /* Make sure the pointer we are storing points to a valid data. */ /* Pairs with the dependency ordering in __ptr_ring_consume. */ smp_wmb(); WRITE_ONCE(r->queue[r->producer++], ptr); if (unlikely(r->producer >= r->size)) r->producer = 0; return 0; } /* * Note: resize (below) nests producer lock within consumer lock, so if you * consume in interrupt or BH context, you must disable interrupts/BH when * calling this. 
*/ static inline int ptr_ring_produce(struct ptr_ring *r, void *ptr) { int ret; spin_lock(&r->producer_lock); ret = __ptr_ring_produce(r, ptr); spin_unlock(&r->producer_lock); return ret; } static inline int ptr_ring_produce_irq(struct ptr_ring *r, void *ptr) { int ret; spin_lock_irq(&r->producer_lock); ret = __ptr_ring_produce(r, ptr); spin_unlock_irq(&r->producer_lock); return ret; } static inline int ptr_ring_produce_any(struct ptr_ring *r, void *ptr) { unsigned long flags; int ret; spin_lock_irqsave(&r->producer_lock, flags); ret = __ptr_ring_produce(r, ptr); spin_unlock_irqrestore(&r->producer_lock, flags); return ret; } static inline int ptr_ring_produce_bh(struct ptr_ring *r, void *ptr) { int ret; spin_lock_bh(&r->producer_lock); ret = __ptr_ring_produce(r, ptr); spin_unlock_bh(&r->producer_lock); return ret; } static inline void *__ptr_ring_peek(struct ptr_ring *r) { if (likely(r->size)) return READ_ONCE(r->queue[r->consumer_head]); return NULL; } /* * Test ring empty status without taking any locks. * * NB: This is only safe to call if ring is never resized. * * However, if some other CPU consumes ring entries at the same time, the value * returned is not guaranteed to be correct. * * In this case - to avoid incorrectly detecting the ring * as empty - the CPU consuming the ring entries is responsible * for either consuming all ring entries until the ring is empty, * or synchronizing with some other CPU and causing it to * re-test __ptr_ring_empty and/or consume the ring enteries * after the synchronization point. * * Note: callers invoking this in a loop must use a compiler barrier, * for example cpu_relax(). */ static inline bool __ptr_ring_empty(struct ptr_ring *r) { if (likely(r->size)) return !r->queue[READ_ONCE(r->consumer_head)]; return true; } static inline bool ptr_ring_empty(struct ptr_ring *r) { bool ret; spin_lock(&r->consumer_lock); ret = __ptr_ring_empty(r); spin_unlock(&r->consumer_lock); return ret; } static inline bool ptr_ring_empty_irq(struct ptr_ring *r) { bool ret; spin_lock_irq(&r->consumer_lock); ret = __ptr_ring_empty(r); spin_unlock_irq(&r->consumer_lock); return ret; } static inline bool ptr_ring_empty_any(struct ptr_ring *r) { unsigned long flags; bool ret; spin_lock_irqsave(&r->consumer_lock, flags); ret = __ptr_ring_empty(r); spin_unlock_irqrestore(&r->consumer_lock, flags); return ret; } static inline bool ptr_ring_empty_bh(struct ptr_ring *r) { bool ret; spin_lock_bh(&r->consumer_lock); ret = __ptr_ring_empty(r); spin_unlock_bh(&r->consumer_lock); return ret; } /* Zero entries from tail to specified head. * NB: if consumer_head can be >= r->size need to fixup tail later. */ static inline void __ptr_ring_zero_tail(struct ptr_ring *r, int consumer_head) { int head = consumer_head; /* Zero out entries in the reverse order: this way we touch the * cache line that producer might currently be reading the last; * producer won't make progress and touch other cache lines * besides the first one until we write out all entries. */ while (likely(head > r->consumer_tail)) r->queue[--head] = NULL; r->consumer_tail = consumer_head; } /* Must only be called after __ptr_ring_peek returned !NULL */ static inline void __ptr_ring_discard_one(struct ptr_ring *r) { /* Fundamentally, what we want to do is update consumer * index and zero out the entry so producer can reuse it. 
* Doing it naively at each consume would be as simple as: * consumer = r->consumer; * r->queue[consumer++] = NULL; * if (unlikely(consumer >= r->size)) * consumer = 0; * r->consumer = consumer; * but that is suboptimal when the ring is full as producer is writing * out new entries in the same cache line. Defer these updates until a * batch of entries has been consumed. */ /* Note: we must keep consumer_head valid at all times for __ptr_ring_empty * to work correctly. */ int consumer_head = r->consumer_head + 1; /* Once we have processed enough entries invalidate them in * the ring all at once so producer can reuse their space in the ring. * We also do this when we reach end of the ring - not mandatory * but helps keep the implementation simple. */ if (unlikely(consumer_head - r->consumer_tail >= r->batch || consumer_head >= r->size)) __ptr_ring_zero_tail(r, consumer_head); if (unlikely(consumer_head >= r->size)) { consumer_head = 0; r->consumer_tail = 0; } /* matching READ_ONCE in __ptr_ring_empty for lockless tests */ WRITE_ONCE(r->consumer_head, consumer_head); } static inline void *__ptr_ring_consume(struct ptr_ring *r) { void *ptr; /* The READ_ONCE in __ptr_ring_peek guarantees that anyone * accessing data through the pointer is up to date. Pairs * with smp_wmb in __ptr_ring_produce. */ ptr = __ptr_ring_peek(r); if (ptr) __ptr_ring_discard_one(r); return ptr; } static inline int __ptr_ring_consume_batched(struct ptr_ring *r, void **array, int n) { void *ptr; int i; for (i = 0; i < n; i++) { ptr = __ptr_ring_consume(r); if (!ptr) break; array[i] = ptr; } return i; } /* * Note: resize (below) nests producer lock within consumer lock, so if you * call this in interrupt or BH context, you must disable interrupts/BH when * producing. */ static inline void *ptr_ring_consume(struct ptr_ring *r) { void *ptr; spin_lock(&r->consumer_lock); ptr = __ptr_ring_consume(r); spin_unlock(&r->consumer_lock); return ptr; } static inline void *ptr_ring_consume_irq(struct ptr_ring *r) { void *ptr; spin_lock_irq(&r->consumer_lock); ptr = __ptr_ring_consume(r); spin_unlock_irq(&r->consumer_lock); return ptr; } static inline void *ptr_ring_consume_any(struct ptr_ring *r) { unsigned long flags; void *ptr; spin_lock_irqsave(&r->consumer_lock, flags); ptr = __ptr_ring_consume(r); spin_unlock_irqrestore(&r->consumer_lock, flags); return ptr; } static inline void *ptr_ring_consume_bh(struct ptr_ring *r) { void *ptr; spin_lock_bh(&r->consumer_lock); ptr = __ptr_ring_consume(r); spin_unlock_bh(&r->consumer_lock); return ptr; } static inline int ptr_ring_consume_batched(struct ptr_ring *r, void **array, int n) { int ret; spin_lock(&r->consumer_lock); ret = __ptr_ring_consume_batched(r, array, n); spin_unlock(&r->consumer_lock); return ret; } static inline int ptr_ring_consume_batched_irq(struct ptr_ring *r, void **array, int n) { int ret; spin_lock_irq(&r->consumer_lock); ret = __ptr_ring_consume_batched(r, array, n); spin_unlock_irq(&r->consumer_lock); return ret; } static inline int ptr_ring_consume_batched_any(struct ptr_ring *r, void **array, int n) { unsigned long flags; int ret; spin_lock_irqsave(&r->consumer_lock, flags); ret = __ptr_ring_consume_batched(r, array, n); spin_unlock_irqrestore(&r->consumer_lock, flags); return ret; } static inline int ptr_ring_consume_batched_bh(struct ptr_ring *r, void **array, int n) { int ret; spin_lock_bh(&r->consumer_lock); ret = __ptr_ring_consume_batched(r, array, n); spin_unlock_bh(&r->consumer_lock); return ret; } /* Cast to structure type and call a function without 
discarding from FIFO. * Function must return a value. * Callers must take consumer_lock. */ #define __PTR_RING_PEEK_CALL(r, f) ((f)(__ptr_ring_peek(r))) #define PTR_RING_PEEK_CALL(r, f) ({ \ typeof((f)(NULL)) __PTR_RING_PEEK_CALL_v; \ \ spin_lock(&(r)->consumer_lock); \ __PTR_RING_PEEK_CALL_v = __PTR_RING_PEEK_CALL(r, f); \ spin_unlock(&(r)->consumer_lock); \ __PTR_RING_PEEK_CALL_v; \ }) #define PTR_RING_PEEK_CALL_IRQ(r, f) ({ \ typeof((f)(NULL)) __PTR_RING_PEEK_CALL_v; \ \ spin_lock_irq(&(r)->consumer_lock); \ __PTR_RING_PEEK_CALL_v = __PTR_RING_PEEK_CALL(r, f); \ spin_unlock_irq(&(r)->consumer_lock); \ __PTR_RING_PEEK_CALL_v; \ }) #define PTR_RING_PEEK_CALL_BH(r, f) ({ \ typeof((f)(NULL)) __PTR_RING_PEEK_CALL_v; \ \ spin_lock_bh(&(r)->consumer_lock); \ __PTR_RING_PEEK_CALL_v = __PTR_RING_PEEK_CALL(r, f); \ spin_unlock_bh(&(r)->consumer_lock); \ __PTR_RING_PEEK_CALL_v; \ }) #define PTR_RING_PEEK_CALL_ANY(r, f) ({ \ typeof((f)(NULL)) __PTR_RING_PEEK_CALL_v; \ unsigned long __PTR_RING_PEEK_CALL_f;\ \ spin_lock_irqsave(&(r)->consumer_lock, __PTR_RING_PEEK_CALL_f); \ __PTR_RING_PEEK_CALL_v = __PTR_RING_PEEK_CALL(r, f); \ spin_unlock_irqrestore(&(r)->consumer_lock, __PTR_RING_PEEK_CALL_f); \ __PTR_RING_PEEK_CALL_v; \ }) /* Not all gfp_t flags (besides GFP_KERNEL) are allowed. See * documentation for vmalloc for which of them are legal. */ static inline void **__ptr_ring_init_queue_alloc_noprof(unsigned int size, gfp_t gfp) { if (size > KMALLOC_MAX_SIZE / sizeof(void *)) return NULL; return kvmalloc_array_noprof(size, sizeof(void *), gfp | __GFP_ZERO); } static inline void __ptr_ring_set_size(struct ptr_ring *r, int size) { r->size = size; r->batch = SMP_CACHE_BYTES * 2 / sizeof(*(r->queue)); /* We need to set batch at least to 1 to make logic * in __ptr_ring_discard_one work correctly. * Batching too much (because ring is small) would cause a lot of * burstiness. Needs tuning, for now disable batching. */ if (r->batch > r->size / 2 || !r->batch) r->batch = 1; } static inline int ptr_ring_init_noprof(struct ptr_ring *r, int size, gfp_t gfp) { r->queue = __ptr_ring_init_queue_alloc_noprof(size, gfp); if (!r->queue) return -ENOMEM; __ptr_ring_set_size(r, size); r->producer = r->consumer_head = r->consumer_tail = 0; spin_lock_init(&r->producer_lock); spin_lock_init(&r->consumer_lock); return 0; } #define ptr_ring_init(...) alloc_hooks(ptr_ring_init_noprof(__VA_ARGS__)) /* * Return entries into ring. Destroy entries that don't fit. * * Note: this is expected to be a rare slow path operation. * * Note: producer lock is nested within consumer lock, so if you * resize you must make sure all uses nest correctly. * In particular if you consume ring in interrupt or BH context, you must * disable interrupts/BH when doing so. */ static inline void ptr_ring_unconsume(struct ptr_ring *r, void **batch, int n, void (*destroy)(void *)) { unsigned long flags; spin_lock_irqsave(&r->consumer_lock, flags); spin_lock(&r->producer_lock); if (!r->size) goto done; /* * Clean out buffered entries (for simplicity). This way following code * can test entries for NULL and if not assume they are valid. */ __ptr_ring_zero_tail(r, r->consumer_head); /* * Go over entries in batch, start moving head back and copy entries. * Stop when we run into previously unconsumed entries. */ while (n) { int head = r->consumer_head - 1; if (head < 0) head = r->size - 1; if (r->queue[head]) { /* This batch entry will have to be destroyed. 
*/ goto done; } r->queue[head] = batch[--n]; r->consumer_tail = head; /* matching READ_ONCE in __ptr_ring_empty for lockless tests */ WRITE_ONCE(r->consumer_head, head); } done: /* Destroy all entries left in the batch. */ while (n) destroy(batch[--n]); spin_unlock(&r->producer_lock); spin_unlock_irqrestore(&r->consumer_lock, flags); } static inline void **__ptr_ring_swap_queue(struct ptr_ring *r, void **queue, int size, gfp_t gfp, void (*destroy)(void *)) { int producer = 0; void **old; void *ptr; while ((ptr = __ptr_ring_consume(r))) if (producer < size) queue[producer++] = ptr; else if (destroy) destroy(ptr); if (producer >= size) producer = 0; __ptr_ring_set_size(r, size); r->producer = producer; r->consumer_head = 0; r->consumer_tail = 0; old = r->queue; r->queue = queue; return old; } /* * Note: producer lock is nested within consumer lock, so if you * resize you must make sure all uses nest correctly. * In particular if you consume ring in interrupt or BH context, you must * disable interrupts/BH when doing so. */ static inline int ptr_ring_resize_noprof(struct ptr_ring *r, int size, gfp_t gfp, void (*destroy)(void *)) { unsigned long flags; void **queue = __ptr_ring_init_queue_alloc_noprof(size, gfp); void **old; if (!queue) return -ENOMEM; spin_lock_irqsave(&(r)->consumer_lock, flags); spin_lock(&(r)->producer_lock); old = __ptr_ring_swap_queue(r, queue, size, gfp, destroy); spin_unlock(&(r)->producer_lock); spin_unlock_irqrestore(&(r)->consumer_lock, flags); kvfree(old); return 0; } #define ptr_ring_resize(...) alloc_hooks(ptr_ring_resize_noprof(__VA_ARGS__)) /* * Note: producer lock is nested within consumer lock, so if you * resize you must make sure all uses nest correctly. * In particular if you consume ring in BH context, you must * disable BH when doing so. */ static inline int ptr_ring_resize_multiple_bh_noprof(struct ptr_ring **rings, unsigned int nrings, int size, gfp_t gfp, void (*destroy)(void *)) { void ***queues; int i; queues = kmalloc_array_noprof(nrings, sizeof(*queues), gfp); if (!queues) goto noqueues; for (i = 0; i < nrings; ++i) { queues[i] = __ptr_ring_init_queue_alloc_noprof(size, gfp); if (!queues[i]) goto nomem; } for (i = 0; i < nrings; ++i) { spin_lock_bh(&(rings[i])->consumer_lock); spin_lock(&(rings[i])->producer_lock); queues[i] = __ptr_ring_swap_queue(rings[i], queues[i], size, gfp, destroy); spin_unlock(&(rings[i])->producer_lock); spin_unlock_bh(&(rings[i])->consumer_lock); } for (i = 0; i < nrings; ++i) kvfree(queues[i]); kfree(queues); return 0; nomem: while (--i >= 0) kvfree(queues[i]); kfree(queues); noqueues: return -ENOMEM; } #define ptr_ring_resize_multiple_bh(...) \ alloc_hooks(ptr_ring_resize_multiple_bh_noprof(__VA_ARGS__)) static inline void ptr_ring_cleanup(struct ptr_ring *r, void (*destroy)(void *)) { void *ptr; if (destroy) while ((ptr = ptr_ring_consume(r))) destroy(ptr); kvfree(r->queue); } #endif /* _LINUX_PTR_RING_H */
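The header above only defines the primitives; the following is a minimal usage sketch, not part of ptr_ring.h. It assumes a hypothetical payload type 'struct foo', process-context callers, and a hypothetical driver function foo_ring_demo(); it shows the typical init/produce/consume/cleanup sequence using the lock-taking wrappers.

#include <linux/ptr_ring.h>
#include <linux/slab.h>

struct foo {				/* hypothetical payload */
	int val;
};

static void foo_destroy(void *ptr)
{
	kfree(ptr);
}

static int foo_ring_demo(void)
{
	struct ptr_ring ring;
	struct foo *f, *got;
	int err;

	err = ptr_ring_init(&ring, 128, GFP_KERNEL);	/* 128 slots, zeroed queue */
	if (err)
		return err;

	f = kzalloc(sizeof(*f), GFP_KERNEL);
	if (!f) {
		err = -ENOMEM;
		goto out_cleanup;
	}
	f->val = 42;

	err = ptr_ring_produce(&ring, f);	/* takes producer_lock; non-zero if full */
	if (err) {
		kfree(f);
		goto out_cleanup;
	}

	got = ptr_ring_consume(&ring);		/* takes consumer_lock */
	kfree(got);				/* kfree(NULL) is a no-op */

out_cleanup:
	ptr_ring_cleanup(&ring, foo_destroy);	/* destroy leftovers, free the queue array */
	return err;
}

ptr_ring_cleanup() consumes whatever is still queued, hands each entry to the destroy callback, and then frees the queue array, so the demo is leak-free on every exit path.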
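The resize helpers above nest the producer lock inside the consumer lock, and the comments note that rings touched from interrupt or BH context must use the IRQ/BH-disabling variants. The sketch below is illustrative only ('demo_ring', 'demo_destroy' and the wrapper names are hypothetical): it uses the _any variants so the locking is safe from any context, and resizes from process context with a destroy callback for entries that no longer fit.

#include <linux/ptr_ring.h>
#include <linux/slab.h>

static struct ptr_ring demo_ring;	/* hypothetical ring, touched from any context */

static void demo_destroy(void *ptr)
{
	kfree(ptr);			/* called for entries that do not fit after a resize */
}

static int demo_enqueue(void *ptr)
{
	/* irqsave variant: correct regardless of the caller's context */
	return ptr_ring_produce_any(&demo_ring, ptr);
}

static void *demo_dequeue(void)
{
	return ptr_ring_consume_any(&demo_ring);
}

static int demo_grow(int new_size)
{
	/*
	 * Slow path, process context: allocates a new queue array, migrates
	 * entries under consumer_lock + producer_lock, frees the old array,
	 * and hands entries that do not fit to demo_destroy().
	 */
	return ptr_ring_resize(&demo_ring, new_size, GFP_KERNEL, demo_destroy);
}

ptr_ring_resize_multiple_bh() follows the same pattern for several rings at once, taking each consumer lock with BH disabled.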
// SPDX-License-Identifier: GPL-2.0-or-later /* SCTP kernel implementation * (C) Copyright Red Hat Inc. 2017 * * This file is part of the SCTP kernel implementation * * These functions implement sctp stream message interleaving, mostly * including I-DATA and I-FORWARD-TSN chunks process.
* * Please send any bug reports or fixes you make to the * email addresched(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * Xin Long <lucien.xin@gmail.com> */ #include <net/busy_poll.h> #include <net/sctp/sctp.h> #include <net/sctp/sm.h> #include <net/sctp/ulpevent.h> #include <linux/sctp.h> static struct sctp_chunk *sctp_make_idatafrag_empty( const struct sctp_association *asoc, const struct sctp_sndrcvinfo *sinfo, int len, __u8 flags, gfp_t gfp) { struct sctp_chunk *retval; struct sctp_idatahdr dp; memset(&dp, 0, sizeof(dp)); dp.stream = htons(sinfo->sinfo_stream); if (sinfo->sinfo_flags & SCTP_UNORDERED) flags |= SCTP_DATA_UNORDERED; retval = sctp_make_idata(asoc, flags, sizeof(dp) + len, gfp); if (!retval) return NULL; retval->subh.idata_hdr = sctp_addto_chunk(retval, sizeof(dp), &dp); memcpy(&retval->sinfo, sinfo, sizeof(struct sctp_sndrcvinfo)); return retval; } static void sctp_chunk_assign_mid(struct sctp_chunk *chunk) { struct sctp_stream *stream; struct sctp_chunk *lchunk; __u32 cfsn = 0; __u16 sid; if (chunk->has_mid) return; sid = sctp_chunk_stream_no(chunk); stream = &chunk->asoc->stream; list_for_each_entry(lchunk, &chunk->msg->chunks, frag_list) { struct sctp_idatahdr *hdr; __u32 mid; lchunk->has_mid = 1; hdr = lchunk->subh.idata_hdr; if (lchunk->chunk_hdr->flags & SCTP_DATA_FIRST_FRAG) hdr->ppid = lchunk->sinfo.sinfo_ppid; else hdr->fsn = htonl(cfsn++); if (lchunk->chunk_hdr->flags & SCTP_DATA_UNORDERED) { mid = lchunk->chunk_hdr->flags & SCTP_DATA_LAST_FRAG ? sctp_mid_uo_next(stream, out, sid) : sctp_mid_uo_peek(stream, out, sid); } else { mid = lchunk->chunk_hdr->flags & SCTP_DATA_LAST_FRAG ? sctp_mid_next(stream, out, sid) : sctp_mid_peek(stream, out, sid); } hdr->mid = htonl(mid); } } static bool sctp_validate_data(struct sctp_chunk *chunk) { struct sctp_stream *stream; __u16 sid, ssn; if (chunk->chunk_hdr->type != SCTP_CID_DATA) return false; if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED) return true; stream = &chunk->asoc->stream; sid = sctp_chunk_stream_no(chunk); ssn = ntohs(chunk->subh.data_hdr->ssn); return !SSN_lt(ssn, sctp_ssn_peek(stream, in, sid)); } static bool sctp_validate_idata(struct sctp_chunk *chunk) { struct sctp_stream *stream; __u32 mid; __u16 sid; if (chunk->chunk_hdr->type != SCTP_CID_I_DATA) return false; if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED) return true; stream = &chunk->asoc->stream; sid = sctp_chunk_stream_no(chunk); mid = ntohl(chunk->subh.idata_hdr->mid); return !MID_lt(mid, sctp_mid_peek(stream, in, sid)); } static void sctp_intl_store_reasm(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sctp_ulpevent *cevent; struct sk_buff *pos, *loc; pos = skb_peek_tail(&ulpq->reasm); if (!pos) { __skb_queue_tail(&ulpq->reasm, sctp_event2skb(event)); return; } cevent = sctp_skb2event(pos); if (event->stream == cevent->stream && event->mid == cevent->mid && (cevent->msg_flags & SCTP_DATA_FIRST_FRAG || (!(event->msg_flags & SCTP_DATA_FIRST_FRAG) && event->fsn > cevent->fsn))) { __skb_queue_tail(&ulpq->reasm, sctp_event2skb(event)); return; } if ((event->stream == cevent->stream && MID_lt(cevent->mid, event->mid)) || event->stream > cevent->stream) { __skb_queue_tail(&ulpq->reasm, sctp_event2skb(event)); return; } loc = NULL; skb_queue_walk(&ulpq->reasm, pos) { cevent = sctp_skb2event(pos); if (event->stream < cevent->stream || (event->stream == cevent->stream && MID_lt(event->mid, cevent->mid))) { loc = pos; break; } if (event->stream == cevent->stream && event->mid == cevent->mid && 
!(cevent->msg_flags & SCTP_DATA_FIRST_FRAG) && (event->msg_flags & SCTP_DATA_FIRST_FRAG || event->fsn < cevent->fsn)) { loc = pos; break; } } if (!loc) __skb_queue_tail(&ulpq->reasm, sctp_event2skb(event)); else __skb_queue_before(&ulpq->reasm, loc, sctp_event2skb(event)); } static struct sctp_ulpevent *sctp_intl_retrieve_partial( struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sk_buff *first_frag = NULL; struct sk_buff *last_frag = NULL; struct sctp_ulpevent *retval; struct sctp_stream_in *sin; struct sk_buff *pos; __u32 next_fsn = 0; int is_last = 0; sin = sctp_stream_in(&ulpq->asoc->stream, event->stream); skb_queue_walk(&ulpq->reasm, pos) { struct sctp_ulpevent *cevent = sctp_skb2event(pos); if (cevent->stream < event->stream) continue; if (cevent->stream > event->stream || cevent->mid != sin->mid) break; switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) { case SCTP_DATA_FIRST_FRAG: goto out; case SCTP_DATA_MIDDLE_FRAG: if (!first_frag) { if (cevent->fsn == sin->fsn) { first_frag = pos; last_frag = pos; next_fsn = cevent->fsn + 1; } } else if (cevent->fsn == next_fsn) { last_frag = pos; next_fsn++; } else { goto out; } break; case SCTP_DATA_LAST_FRAG: if (!first_frag) { if (cevent->fsn == sin->fsn) { first_frag = pos; last_frag = pos; next_fsn = 0; is_last = 1; } } else if (cevent->fsn == next_fsn) { last_frag = pos; next_fsn = 0; is_last = 1; } goto out; default: goto out; } } out: if (!first_frag) return NULL; retval = sctp_make_reassembled_event(ulpq->asoc->base.net, &ulpq->reasm, first_frag, last_frag); if (retval) { sin->fsn = next_fsn; if (is_last) { retval->msg_flags |= MSG_EOR; sin->pd_mode = 0; } } return retval; } static struct sctp_ulpevent *sctp_intl_retrieve_reassembled( struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sctp_association *asoc = ulpq->asoc; struct sk_buff *pos, *first_frag = NULL; struct sctp_ulpevent *retval = NULL; struct sk_buff *pd_first = NULL; struct sk_buff *pd_last = NULL; struct sctp_stream_in *sin; __u32 next_fsn = 0; __u32 pd_point = 0; __u32 pd_len = 0; __u32 mid = 0; sin = sctp_stream_in(&ulpq->asoc->stream, event->stream); skb_queue_walk(&ulpq->reasm, pos) { struct sctp_ulpevent *cevent = sctp_skb2event(pos); if (cevent->stream < event->stream) continue; if (cevent->stream > event->stream) break; if (MID_lt(cevent->mid, event->mid)) continue; if (MID_lt(event->mid, cevent->mid)) break; switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) { case SCTP_DATA_FIRST_FRAG: if (cevent->mid == sin->mid) { pd_first = pos; pd_last = pos; pd_len = pos->len; } first_frag = pos; next_fsn = 0; mid = cevent->mid; break; case SCTP_DATA_MIDDLE_FRAG: if (first_frag && cevent->mid == mid && cevent->fsn == next_fsn) { next_fsn++; if (pd_first) { pd_last = pos; pd_len += pos->len; } } else { first_frag = NULL; } break; case SCTP_DATA_LAST_FRAG: if (first_frag && cevent->mid == mid && cevent->fsn == next_fsn) goto found; else first_frag = NULL; break; } } if (!pd_first) goto out; pd_point = sctp_sk(asoc->base.sk)->pd_point; if (pd_point && pd_point <= pd_len) { retval = sctp_make_reassembled_event(asoc->base.net, &ulpq->reasm, pd_first, pd_last); if (retval) { sin->fsn = next_fsn; sin->pd_mode = 1; } } goto out; found: retval = sctp_make_reassembled_event(asoc->base.net, &ulpq->reasm, first_frag, pos); if (retval) retval->msg_flags |= MSG_EOR; out: return retval; } static struct sctp_ulpevent *sctp_intl_reasm(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sctp_ulpevent *retval = NULL; struct sctp_stream_in *sin; if 
(SCTP_DATA_NOT_FRAG == (event->msg_flags & SCTP_DATA_FRAG_MASK)) { event->msg_flags |= MSG_EOR; return event; } sctp_intl_store_reasm(ulpq, event); sin = sctp_stream_in(&ulpq->asoc->stream, event->stream); if (sin->pd_mode && event->mid == sin->mid && event->fsn == sin->fsn) retval = sctp_intl_retrieve_partial(ulpq, event); if (!retval) retval = sctp_intl_retrieve_reassembled(ulpq, event); return retval; } static void sctp_intl_store_ordered(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sctp_ulpevent *cevent; struct sk_buff *pos, *loc; pos = skb_peek_tail(&ulpq->lobby); if (!pos) { __skb_queue_tail(&ulpq->lobby, sctp_event2skb(event)); return; } cevent = (struct sctp_ulpevent *)pos->cb; if (event->stream == cevent->stream && MID_lt(cevent->mid, event->mid)) { __skb_queue_tail(&ulpq->lobby, sctp_event2skb(event)); return; } if (event->stream > cevent->stream) { __skb_queue_tail(&ulpq->lobby, sctp_event2skb(event)); return; } loc = NULL; skb_queue_walk(&ulpq->lobby, pos) { cevent = (struct sctp_ulpevent *)pos->cb; if (cevent->stream > event->stream) { loc = pos; break; } if (cevent->stream == event->stream && MID_lt(event->mid, cevent->mid)) { loc = pos; break; } } if (!loc) __skb_queue_tail(&ulpq->lobby, sctp_event2skb(event)); else __skb_queue_before(&ulpq->lobby, loc, sctp_event2skb(event)); } static void sctp_intl_retrieve_ordered(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sk_buff_head *event_list; struct sctp_stream *stream; struct sk_buff *pos, *tmp; __u16 sid = event->stream; stream = &ulpq->asoc->stream; event_list = (struct sk_buff_head *)sctp_event2skb(event)->prev; sctp_skb_for_each(pos, &ulpq->lobby, tmp) { struct sctp_ulpevent *cevent = (struct sctp_ulpevent *)pos->cb; if (cevent->stream > sid) break; if (cevent->stream < sid) continue; if (cevent->mid != sctp_mid_peek(stream, in, sid)) break; sctp_mid_next(stream, in, sid); __skb_unlink(pos, &ulpq->lobby); __skb_queue_tail(event_list, pos); } } static struct sctp_ulpevent *sctp_intl_order(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sctp_stream *stream; __u16 sid; stream = &ulpq->asoc->stream; sid = event->stream; if (event->mid != sctp_mid_peek(stream, in, sid)) { sctp_intl_store_ordered(ulpq, event); return NULL; } sctp_mid_next(stream, in, sid); sctp_intl_retrieve_ordered(ulpq, event); return event; } static int sctp_enqueue_event(struct sctp_ulpq *ulpq, struct sk_buff_head *skb_list) { struct sock *sk = ulpq->asoc->base.sk; struct sctp_sock *sp = sctp_sk(sk); struct sctp_ulpevent *event; struct sk_buff *skb; skb = __skb_peek(skb_list); event = sctp_skb2event(skb); if (sk->sk_shutdown & RCV_SHUTDOWN && (sk->sk_shutdown & SEND_SHUTDOWN || !sctp_ulpevent_is_notification(event))) goto out_free; if (!sctp_ulpevent_is_notification(event)) { sk_mark_napi_id(sk, skb); sk_incoming_cpu_update(sk); } if (!sctp_ulpevent_is_enabled(event, ulpq->asoc->subscribe)) goto out_free; skb_queue_splice_tail_init(skb_list, &sk->sk_receive_queue); if (!sp->data_ready_signalled) { sp->data_ready_signalled = 1; sk->sk_data_ready(sk); } return 1; out_free: sctp_queue_purge_ulpevents(skb_list); return 0; } static void sctp_intl_store_reasm_uo(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sctp_ulpevent *cevent; struct sk_buff *pos; pos = skb_peek_tail(&ulpq->reasm_uo); if (!pos) { __skb_queue_tail(&ulpq->reasm_uo, sctp_event2skb(event)); return; } cevent = sctp_skb2event(pos); if (event->stream == cevent->stream && event->mid == cevent->mid && (cevent->msg_flags & SCTP_DATA_FIRST_FRAG 
|| (!(event->msg_flags & SCTP_DATA_FIRST_FRAG) && event->fsn > cevent->fsn))) { __skb_queue_tail(&ulpq->reasm_uo, sctp_event2skb(event)); return; } if ((event->stream == cevent->stream && MID_lt(cevent->mid, event->mid)) || event->stream > cevent->stream) { __skb_queue_tail(&ulpq->reasm_uo, sctp_event2skb(event)); return; } skb_queue_walk(&ulpq->reasm_uo, pos) { cevent = sctp_skb2event(pos); if (event->stream < cevent->stream || (event->stream == cevent->stream && MID_lt(event->mid, cevent->mid))) break; if (event->stream == cevent->stream && event->mid == cevent->mid && !(cevent->msg_flags & SCTP_DATA_FIRST_FRAG) && (event->msg_flags & SCTP_DATA_FIRST_FRAG || event->fsn < cevent->fsn)) break; } __skb_queue_before(&ulpq->reasm_uo, pos, sctp_event2skb(event)); } static struct sctp_ulpevent *sctp_intl_retrieve_partial_uo( struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sk_buff *first_frag = NULL; struct sk_buff *last_frag = NULL; struct sctp_ulpevent *retval; struct sctp_stream_in *sin; struct sk_buff *pos; __u32 next_fsn = 0; int is_last = 0; sin = sctp_stream_in(&ulpq->asoc->stream, event->stream); skb_queue_walk(&ulpq->reasm_uo, pos) { struct sctp_ulpevent *cevent = sctp_skb2event(pos); if (cevent->stream < event->stream) continue; if (cevent->stream > event->stream) break; if (MID_lt(cevent->mid, sin->mid_uo)) continue; if (MID_lt(sin->mid_uo, cevent->mid)) break; switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) { case SCTP_DATA_FIRST_FRAG: goto out; case SCTP_DATA_MIDDLE_FRAG: if (!first_frag) { if (cevent->fsn == sin->fsn_uo) { first_frag = pos; last_frag = pos; next_fsn = cevent->fsn + 1; } } else if (cevent->fsn == next_fsn) { last_frag = pos; next_fsn++; } else { goto out; } break; case SCTP_DATA_LAST_FRAG: if (!first_frag) { if (cevent->fsn == sin->fsn_uo) { first_frag = pos; last_frag = pos; next_fsn = 0; is_last = 1; } } else if (cevent->fsn == next_fsn) { last_frag = pos; next_fsn = 0; is_last = 1; } goto out; default: goto out; } } out: if (!first_frag) return NULL; retval = sctp_make_reassembled_event(ulpq->asoc->base.net, &ulpq->reasm_uo, first_frag, last_frag); if (retval) { sin->fsn_uo = next_fsn; if (is_last) { retval->msg_flags |= MSG_EOR; sin->pd_mode_uo = 0; } } return retval; } static struct sctp_ulpevent *sctp_intl_retrieve_reassembled_uo( struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sctp_association *asoc = ulpq->asoc; struct sk_buff *pos, *first_frag = NULL; struct sctp_ulpevent *retval = NULL; struct sk_buff *pd_first = NULL; struct sk_buff *pd_last = NULL; struct sctp_stream_in *sin; __u32 next_fsn = 0; __u32 pd_point = 0; __u32 pd_len = 0; __u32 mid = 0; sin = sctp_stream_in(&ulpq->asoc->stream, event->stream); skb_queue_walk(&ulpq->reasm_uo, pos) { struct sctp_ulpevent *cevent = sctp_skb2event(pos); if (cevent->stream < event->stream) continue; if (cevent->stream > event->stream) break; if (MID_lt(cevent->mid, event->mid)) continue; if (MID_lt(event->mid, cevent->mid)) break; switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) { case SCTP_DATA_FIRST_FRAG: if (!sin->pd_mode_uo) { sin->mid_uo = cevent->mid; pd_first = pos; pd_last = pos; pd_len = pos->len; } first_frag = pos; next_fsn = 0; mid = cevent->mid; break; case SCTP_DATA_MIDDLE_FRAG: if (first_frag && cevent->mid == mid && cevent->fsn == next_fsn) { next_fsn++; if (pd_first) { pd_last = pos; pd_len += pos->len; } } else { first_frag = NULL; } break; case SCTP_DATA_LAST_FRAG: if (first_frag && cevent->mid == mid && cevent->fsn == next_fsn) goto found; else first_frag = NULL; 
break; } } if (!pd_first) goto out; pd_point = sctp_sk(asoc->base.sk)->pd_point; if (pd_point && pd_point <= pd_len) { retval = sctp_make_reassembled_event(asoc->base.net, &ulpq->reasm_uo, pd_first, pd_last); if (retval) { sin->fsn_uo = next_fsn; sin->pd_mode_uo = 1; } } goto out; found: retval = sctp_make_reassembled_event(asoc->base.net, &ulpq->reasm_uo, first_frag, pos); if (retval) retval->msg_flags |= MSG_EOR; out: return retval; } static struct sctp_ulpevent *sctp_intl_reasm_uo(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sctp_ulpevent *retval = NULL; struct sctp_stream_in *sin; if (SCTP_DATA_NOT_FRAG == (event->msg_flags & SCTP_DATA_FRAG_MASK)) { event->msg_flags |= MSG_EOR; return event; } sctp_intl_store_reasm_uo(ulpq, event); sin = sctp_stream_in(&ulpq->asoc->stream, event->stream); if (sin->pd_mode_uo && event->mid == sin->mid_uo && event->fsn == sin->fsn_uo) retval = sctp_intl_retrieve_partial_uo(ulpq, event); if (!retval) retval = sctp_intl_retrieve_reassembled_uo(ulpq, event); return retval; } static struct sctp_ulpevent *sctp_intl_retrieve_first_uo(struct sctp_ulpq *ulpq) { struct sctp_stream_in *csin, *sin = NULL; struct sk_buff *first_frag = NULL; struct sk_buff *last_frag = NULL; struct sctp_ulpevent *retval; struct sk_buff *pos; __u32 next_fsn = 0; __u16 sid = 0; skb_queue_walk(&ulpq->reasm_uo, pos) { struct sctp_ulpevent *cevent = sctp_skb2event(pos); csin = sctp_stream_in(&ulpq->asoc->stream, cevent->stream); if (csin->pd_mode_uo) continue; switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) { case SCTP_DATA_FIRST_FRAG: if (first_frag) goto out; first_frag = pos; last_frag = pos; next_fsn = 0; sin = csin; sid = cevent->stream; sin->mid_uo = cevent->mid; break; case SCTP_DATA_MIDDLE_FRAG: if (!first_frag) break; if (cevent->stream == sid && cevent->mid == sin->mid_uo && cevent->fsn == next_fsn) { next_fsn++; last_frag = pos; } else { goto out; } break; case SCTP_DATA_LAST_FRAG: if (first_frag) goto out; break; default: break; } } if (!first_frag) return NULL; out: retval = sctp_make_reassembled_event(ulpq->asoc->base.net, &ulpq->reasm_uo, first_frag, last_frag); if (retval) { sin->fsn_uo = next_fsn; sin->pd_mode_uo = 1; } return retval; } static int sctp_ulpevent_idata(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk, gfp_t gfp) { struct sctp_ulpevent *event; struct sk_buff_head temp; int event_eor = 0; event = sctp_ulpevent_make_rcvmsg(chunk->asoc, chunk, gfp); if (!event) return -ENOMEM; event->mid = ntohl(chunk->subh.idata_hdr->mid); if (event->msg_flags & SCTP_DATA_FIRST_FRAG) event->ppid = chunk->subh.idata_hdr->ppid; else event->fsn = ntohl(chunk->subh.idata_hdr->fsn); if (!(event->msg_flags & SCTP_DATA_UNORDERED)) { event = sctp_intl_reasm(ulpq, event); if (event) { skb_queue_head_init(&temp); __skb_queue_tail(&temp, sctp_event2skb(event)); if (event->msg_flags & MSG_EOR) event = sctp_intl_order(ulpq, event); } } else { event = sctp_intl_reasm_uo(ulpq, event); if (event) { skb_queue_head_init(&temp); __skb_queue_tail(&temp, sctp_event2skb(event)); } } if (event) { event_eor = (event->msg_flags & MSG_EOR) ? 
1 : 0; sctp_enqueue_event(ulpq, &temp); } return event_eor; } static struct sctp_ulpevent *sctp_intl_retrieve_first(struct sctp_ulpq *ulpq) { struct sctp_stream_in *csin, *sin = NULL; struct sk_buff *first_frag = NULL; struct sk_buff *last_frag = NULL; struct sctp_ulpevent *retval; struct sk_buff *pos; __u32 next_fsn = 0; __u16 sid = 0; skb_queue_walk(&ulpq->reasm, pos) { struct sctp_ulpevent *cevent = sctp_skb2event(pos); csin = sctp_stream_in(&ulpq->asoc->stream, cevent->stream); if (csin->pd_mode) continue; switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) { case SCTP_DATA_FIRST_FRAG: if (first_frag) goto out; if (cevent->mid == csin->mid) { first_frag = pos; last_frag = pos; next_fsn = 0; sin = csin; sid = cevent->stream; } break; case SCTP_DATA_MIDDLE_FRAG: if (!first_frag) break; if (cevent->stream == sid && cevent->mid == sin->mid && cevent->fsn == next_fsn) { next_fsn++; last_frag = pos; } else { goto out; } break; case SCTP_DATA_LAST_FRAG: if (first_frag) goto out; break; default: break; } } if (!first_frag) return NULL; out: retval = sctp_make_reassembled_event(ulpq->asoc->base.net, &ulpq->reasm, first_frag, last_frag); if (retval) { sin->fsn = next_fsn; sin->pd_mode = 1; } return retval; } static void sctp_intl_start_pd(struct sctp_ulpq *ulpq, gfp_t gfp) { struct sctp_ulpevent *event; struct sk_buff_head temp; if (!skb_queue_empty(&ulpq->reasm)) { do { event = sctp_intl_retrieve_first(ulpq); if (event) { skb_queue_head_init(&temp); __skb_queue_tail(&temp, sctp_event2skb(event)); sctp_enqueue_event(ulpq, &temp); } } while (event); } if (!skb_queue_empty(&ulpq->reasm_uo)) { do { event = sctp_intl_retrieve_first_uo(ulpq); if (event) { skb_queue_head_init(&temp); __skb_queue_tail(&temp, sctp_event2skb(event)); sctp_enqueue_event(ulpq, &temp); } } while (event); } } static void sctp_renege_events(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk, gfp_t gfp) { struct sctp_association *asoc = ulpq->asoc; __u32 freed = 0; __u16 needed; needed = ntohs(chunk->chunk_hdr->length) - sizeof(struct sctp_idata_chunk); if (skb_queue_empty(&asoc->base.sk->sk_receive_queue)) { freed = sctp_ulpq_renege_list(ulpq, &ulpq->lobby, needed); if (freed < needed) freed += sctp_ulpq_renege_list(ulpq, &ulpq->reasm, needed); if (freed < needed) freed += sctp_ulpq_renege_list(ulpq, &ulpq->reasm_uo, needed); } if (freed >= needed && sctp_ulpevent_idata(ulpq, chunk, gfp) <= 0) sctp_intl_start_pd(ulpq, gfp); } static void sctp_intl_stream_abort_pd(struct sctp_ulpq *ulpq, __u16 sid, __u32 mid, __u16 flags, gfp_t gfp) { struct sock *sk = ulpq->asoc->base.sk; struct sctp_ulpevent *ev = NULL; if (!sctp_ulpevent_type_enabled(ulpq->asoc->subscribe, SCTP_PARTIAL_DELIVERY_EVENT)) return; ev = sctp_ulpevent_make_pdapi(ulpq->asoc, SCTP_PARTIAL_DELIVERY_ABORTED, sid, mid, flags, gfp); if (ev) { struct sctp_sock *sp = sctp_sk(sk); __skb_queue_tail(&sk->sk_receive_queue, sctp_event2skb(ev)); if (!sp->data_ready_signalled) { sp->data_ready_signalled = 1; sk->sk_data_ready(sk); } } } static void sctp_intl_reap_ordered(struct sctp_ulpq *ulpq, __u16 sid) { struct sctp_stream *stream = &ulpq->asoc->stream; struct sctp_ulpevent *cevent, *event = NULL; struct sk_buff_head *lobby = &ulpq->lobby; struct sk_buff *pos, *tmp; struct sk_buff_head temp; __u16 csid; __u32 cmid; skb_queue_head_init(&temp); sctp_skb_for_each(pos, lobby, tmp) { cevent = (struct sctp_ulpevent *)pos->cb; csid = cevent->stream; cmid = cevent->mid; if (csid > sid) break; if (csid < sid) continue; if (!MID_lt(cmid, sctp_mid_peek(stream, in, csid))) break; 
__skb_unlink(pos, lobby); if (!event) event = sctp_skb2event(pos); __skb_queue_tail(&temp, pos); } if (!event && pos != (struct sk_buff *)lobby) { cevent = (struct sctp_ulpevent *)pos->cb; csid = cevent->stream; cmid = cevent->mid; if (csid == sid && cmid == sctp_mid_peek(stream, in, csid)) { sctp_mid_next(stream, in, csid); __skb_unlink(pos, lobby); __skb_queue_tail(&temp, pos); event = sctp_skb2event(pos); } } if (event) { sctp_intl_retrieve_ordered(ulpq, event); sctp_enqueue_event(ulpq, &temp); } } static void sctp_intl_abort_pd(struct sctp_ulpq *ulpq, gfp_t gfp) { struct sctp_stream *stream = &ulpq->asoc->stream; __u16 sid; for (sid = 0; sid < stream->incnt; sid++) { struct sctp_stream_in *sin = SCTP_SI(stream, sid); __u32 mid; if (sin->pd_mode_uo) { sin->pd_mode_uo = 0; mid = sin->mid_uo; sctp_intl_stream_abort_pd(ulpq, sid, mid, 0x1, gfp); } if (sin->pd_mode) { sin->pd_mode = 0; mid = sin->mid; sctp_intl_stream_abort_pd(ulpq, sid, mid, 0, gfp); sctp_mid_skip(stream, in, sid, mid); sctp_intl_reap_ordered(ulpq, sid); } } /* intl abort pd happens only when all data needs to be cleaned */ sctp_ulpq_flush(ulpq); } static inline int sctp_get_skip_pos(struct sctp_ifwdtsn_skip *skiplist, int nskips, __be16 stream, __u8 flags) { int i; for (i = 0; i < nskips; i++) if (skiplist[i].stream == stream && skiplist[i].flags == flags) return i; return i; } #define SCTP_FTSN_U_BIT 0x1 static void sctp_generate_iftsn(struct sctp_outq *q, __u32 ctsn) { struct sctp_ifwdtsn_skip ftsn_skip_arr[10]; struct sctp_association *asoc = q->asoc; struct sctp_chunk *ftsn_chunk = NULL; struct list_head *lchunk, *temp; int nskips = 0, skip_pos; struct sctp_chunk *chunk; __u32 tsn; if (!asoc->peer.prsctp_capable) return; if (TSN_lt(asoc->adv_peer_ack_point, ctsn)) asoc->adv_peer_ack_point = ctsn; list_for_each_safe(lchunk, temp, &q->abandoned) { chunk = list_entry(lchunk, struct sctp_chunk, transmitted_list); tsn = ntohl(chunk->subh.data_hdr->tsn); if (TSN_lte(tsn, ctsn)) { list_del_init(lchunk); sctp_chunk_free(chunk); } else if (TSN_lte(tsn, asoc->adv_peer_ack_point + 1)) { __be16 sid = chunk->subh.idata_hdr->stream; __be32 mid = chunk->subh.idata_hdr->mid; __u8 flags = 0; if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED) flags |= SCTP_FTSN_U_BIT; asoc->adv_peer_ack_point = tsn; skip_pos = sctp_get_skip_pos(&ftsn_skip_arr[0], nskips, sid, flags); ftsn_skip_arr[skip_pos].stream = sid; ftsn_skip_arr[skip_pos].reserved = 0; ftsn_skip_arr[skip_pos].flags = flags; ftsn_skip_arr[skip_pos].mid = mid; if (skip_pos == nskips) nskips++; if (nskips == 10) break; } else { break; } } if (asoc->adv_peer_ack_point > ctsn) ftsn_chunk = sctp_make_ifwdtsn(asoc, asoc->adv_peer_ack_point, nskips, &ftsn_skip_arr[0]); if (ftsn_chunk) { list_add_tail(&ftsn_chunk->list, &q->control_chunk_list); SCTP_INC_STATS(asoc->base.net, SCTP_MIB_OUTCTRLCHUNKS); } } #define _sctp_walk_ifwdtsn(pos, chunk, end) \ for (pos = (void *)(chunk->subh.ifwdtsn_hdr + 1); \ (void *)pos <= (void *)(chunk->subh.ifwdtsn_hdr + 1) + (end) - \ sizeof(struct sctp_ifwdtsn_skip); pos++) #define sctp_walk_ifwdtsn(pos, ch) \ _sctp_walk_ifwdtsn((pos), (ch), ntohs((ch)->chunk_hdr->length) - \ sizeof(struct sctp_ifwdtsn_chunk)) static bool sctp_validate_fwdtsn(struct sctp_chunk *chunk) { struct sctp_fwdtsn_skip *skip; __u16 incnt; if (chunk->chunk_hdr->type != SCTP_CID_FWD_TSN) return false; incnt = chunk->asoc->stream.incnt; sctp_walk_fwdtsn(skip, chunk) if (ntohs(skip->stream) >= incnt) return false; return true; } static bool sctp_validate_iftsn(struct sctp_chunk *chunk) { 
struct sctp_ifwdtsn_skip *skip; __u16 incnt; if (chunk->chunk_hdr->type != SCTP_CID_I_FWD_TSN) return false; incnt = chunk->asoc->stream.incnt; sctp_walk_ifwdtsn(skip, chunk) if (ntohs(skip->stream) >= incnt) return false; return true; } static void sctp_report_fwdtsn(struct sctp_ulpq *ulpq, __u32 ftsn) { /* Move the Cumulattive TSN Ack ahead. */ sctp_tsnmap_skip(&ulpq->asoc->peer.tsn_map, ftsn); /* purge the fragmentation queue */ sctp_ulpq_reasm_flushtsn(ulpq, ftsn); /* Abort any in progress partial delivery. */ sctp_ulpq_abort_pd(ulpq, GFP_ATOMIC); } static void sctp_intl_reasm_flushtsn(struct sctp_ulpq *ulpq, __u32 ftsn) { struct sk_buff *pos, *tmp; skb_queue_walk_safe(&ulpq->reasm, pos, tmp) { struct sctp_ulpevent *event = sctp_skb2event(pos); __u32 tsn = event->tsn; if (TSN_lte(tsn, ftsn)) { __skb_unlink(pos, &ulpq->reasm); sctp_ulpevent_free(event); } } skb_queue_walk_safe(&ulpq->reasm_uo, pos, tmp) { struct sctp_ulpevent *event = sctp_skb2event(pos); __u32 tsn = event->tsn; if (TSN_lte(tsn, ftsn)) { __skb_unlink(pos, &ulpq->reasm_uo); sctp_ulpevent_free(event); } } } static void sctp_report_iftsn(struct sctp_ulpq *ulpq, __u32 ftsn) { /* Move the Cumulattive TSN Ack ahead. */ sctp_tsnmap_skip(&ulpq->asoc->peer.tsn_map, ftsn); /* purge the fragmentation queue */ sctp_intl_reasm_flushtsn(ulpq, ftsn); /* abort only when it's for all data */ if (ftsn == sctp_tsnmap_get_max_tsn_seen(&ulpq->asoc->peer.tsn_map)) sctp_intl_abort_pd(ulpq, GFP_ATOMIC); } static void sctp_handle_fwdtsn(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk) { struct sctp_fwdtsn_skip *skip; /* Walk through all the skipped SSNs */ sctp_walk_fwdtsn(skip, chunk) sctp_ulpq_skip(ulpq, ntohs(skip->stream), ntohs(skip->ssn)); } static void sctp_intl_skip(struct sctp_ulpq *ulpq, __u16 sid, __u32 mid, __u8 flags) { struct sctp_stream_in *sin = sctp_stream_in(&ulpq->asoc->stream, sid); struct sctp_stream *stream = &ulpq->asoc->stream; if (flags & SCTP_FTSN_U_BIT) { if (sin->pd_mode_uo && MID_lt(sin->mid_uo, mid)) { sin->pd_mode_uo = 0; sctp_intl_stream_abort_pd(ulpq, sid, mid, 0x1, GFP_ATOMIC); } return; } if (MID_lt(mid, sctp_mid_peek(stream, in, sid))) return; if (sin->pd_mode) { sin->pd_mode = 0; sctp_intl_stream_abort_pd(ulpq, sid, mid, 0x0, GFP_ATOMIC); } sctp_mid_skip(stream, in, sid, mid); sctp_intl_reap_ordered(ulpq, sid); } static void sctp_handle_iftsn(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk) { struct sctp_ifwdtsn_skip *skip; /* Walk through all the skipped MIDs and abort stream pd if possible */ sctp_walk_ifwdtsn(skip, chunk) sctp_intl_skip(ulpq, ntohs(skip->stream), ntohl(skip->mid), skip->flags); } static int do_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sk_buff_head temp; skb_queue_head_init(&temp); __skb_queue_tail(&temp, sctp_event2skb(event)); return sctp_ulpq_tail_event(ulpq, &temp); } static struct sctp_stream_interleave sctp_stream_interleave_0 = { .data_chunk_len = sizeof(struct sctp_data_chunk), .ftsn_chunk_len = sizeof(struct sctp_fwdtsn_chunk), /* DATA process functions */ .make_datafrag = sctp_make_datafrag_empty, .assign_number = sctp_chunk_assign_ssn, .validate_data = sctp_validate_data, .ulpevent_data = sctp_ulpq_tail_data, .enqueue_event = do_ulpq_tail_event, .renege_events = sctp_ulpq_renege, .start_pd = sctp_ulpq_partial_delivery, .abort_pd = sctp_ulpq_abort_pd, /* FORWARD-TSN process functions */ .generate_ftsn = sctp_generate_fwdtsn, .validate_ftsn = sctp_validate_fwdtsn, .report_ftsn = sctp_report_fwdtsn, .handle_ftsn = sctp_handle_fwdtsn, }; static 
int do_sctp_enqueue_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sk_buff_head temp; skb_queue_head_init(&temp); __skb_queue_tail(&temp, sctp_event2skb(event)); return sctp_enqueue_event(ulpq, &temp); } static struct sctp_stream_interleave sctp_stream_interleave_1 = { .data_chunk_len = sizeof(struct sctp_idata_chunk), .ftsn_chunk_len = sizeof(struct sctp_ifwdtsn_chunk), /* I-DATA process functions */ .make_datafrag = sctp_make_idatafrag_empty, .assign_number = sctp_chunk_assign_mid, .validate_data = sctp_validate_idata, .ulpevent_data = sctp_ulpevent_idata, .enqueue_event = do_sctp_enqueue_event, .renege_events = sctp_renege_events, .start_pd = sctp_intl_start_pd, .abort_pd = sctp_intl_abort_pd, /* I-FORWARD-TSN process functions */ .generate_ftsn = sctp_generate_iftsn, .validate_ftsn = sctp_validate_iftsn, .report_ftsn = sctp_report_iftsn, .handle_ftsn = sctp_handle_iftsn, }; void sctp_stream_interleave_init(struct sctp_stream *stream) { struct sctp_association *asoc; asoc = container_of(stream, struct sctp_association, stream); stream->si = asoc->peer.intl_capable ? &sctp_stream_interleave_1 : &sctp_stream_interleave_0; }
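Everything in this file is reached through the two ops tables at the end: sctp_stream_interleave_init() selects sctp_stream_interleave_1 when the peer negotiated interleaving (I-DATA / I-FORWARD-TSN) and sctp_stream_interleave_0 otherwise, and the rest of the stack dispatches through stream->si. The fragment below is a hedged sketch of such a dispatch site, not code from this file; demo_deliver_data() is a hypothetical name and the real call sites live in the SCTP state machine.

#include <net/sctp/sctp.h>
#include <net/sctp/ulpevent.h>

/* Hypothetical illustration of dispatching through the negotiated ops table. */
static int demo_deliver_data(struct sctp_association *asoc,
			     struct sctp_chunk *chunk, gfp_t gfp)
{
	const struct sctp_stream_interleave *si = asoc->stream.si;

	/* sctp_validate_data() or sctp_validate_idata(), depending on si */
	if (!si->validate_data(chunk))
		return -EINVAL;

	/* sctp_ulpq_tail_data() or sctp_ulpevent_idata(), depending on si */
	return si->ulpevent_data(&asoc->ulpq, chunk, gfp);
}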
// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2017 Nicira, Inc.
*/ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/if.h> #include <linux/skbuff.h> #include <linux/ip.h> #include <linux/kernel.h> #include <linux/openvswitch.h> #include <linux/netlink.h> #include <linux/rculist.h> #include <net/netlink.h> #include <net/genetlink.h> #include "datapath.h" #include "meter.h" static const struct nla_policy meter_policy[OVS_METER_ATTR_MAX + 1] = { [OVS_METER_ATTR_ID] = { .type = NLA_U32, }, [OVS_METER_ATTR_KBPS] = { .type = NLA_FLAG }, [OVS_METER_ATTR_STATS] = { .len = sizeof(struct ovs_flow_stats) }, [OVS_METER_ATTR_BANDS] = { .type = NLA_NESTED }, [OVS_METER_ATTR_USED] = { .type = NLA_U64 }, [OVS_METER_ATTR_CLEAR] = { .type = NLA_FLAG }, [OVS_METER_ATTR_MAX_METERS] = { .type = NLA_U32 }, [OVS_METER_ATTR_MAX_BANDS] = { .type = NLA_U32 }, }; static const struct nla_policy band_policy[OVS_BAND_ATTR_MAX + 1] = { [OVS_BAND_ATTR_TYPE] = { .type = NLA_U32, }, [OVS_BAND_ATTR_RATE] = { .type = NLA_U32, }, [OVS_BAND_ATTR_BURST] = { .type = NLA_U32, }, [OVS_BAND_ATTR_STATS] = { .len = sizeof(struct ovs_flow_stats) }, }; static u32 meter_hash(struct dp_meter_instance *ti, u32 id) { return id % ti->n_meters; } static void ovs_meter_free(struct dp_meter *meter) { if (!meter) return; kfree_rcu(meter, rcu); } /* Call with ovs_mutex or RCU read lock. */ static struct dp_meter *lookup_meter(const struct dp_meter_table *tbl, u32 meter_id) { struct dp_meter_instance *ti = rcu_dereference_ovsl(tbl->ti); u32 hash = meter_hash(ti, meter_id); struct dp_meter *meter; meter = rcu_dereference_ovsl(ti->dp_meters[hash]); if (meter && likely(meter->id == meter_id)) return meter; return NULL; } static struct dp_meter_instance *dp_meter_instance_alloc(const u32 size) { struct dp_meter_instance *ti; ti = kvzalloc(struct_size(ti, dp_meters, size), GFP_KERNEL); if (!ti) return NULL; ti->n_meters = size; return ti; } static void dp_meter_instance_free(struct dp_meter_instance *ti) { kvfree(ti); } static void dp_meter_instance_free_rcu(struct rcu_head *rcu) { struct dp_meter_instance *ti; ti = container_of(rcu, struct dp_meter_instance, rcu); kvfree(ti); } static int dp_meter_instance_realloc(struct dp_meter_table *tbl, u32 size) { struct dp_meter_instance *ti = rcu_dereference_ovsl(tbl->ti); int n_meters = min(size, ti->n_meters); struct dp_meter_instance *new_ti; int i; new_ti = dp_meter_instance_alloc(size); if (!new_ti) return -ENOMEM; for (i = 0; i < n_meters; i++) if (rcu_dereference_ovsl(ti->dp_meters[i])) new_ti->dp_meters[i] = ti->dp_meters[i]; rcu_assign_pointer(tbl->ti, new_ti); call_rcu(&ti->rcu, dp_meter_instance_free_rcu); return 0; } static void dp_meter_instance_insert(struct dp_meter_instance *ti, struct dp_meter *meter) { u32 hash; hash = meter_hash(ti, meter->id); rcu_assign_pointer(ti->dp_meters[hash], meter); } static void dp_meter_instance_remove(struct dp_meter_instance *ti, struct dp_meter *meter) { u32 hash; hash = meter_hash(ti, meter->id); RCU_INIT_POINTER(ti->dp_meters[hash], NULL); } static int attach_meter(struct dp_meter_table *tbl, struct dp_meter *meter) { struct dp_meter_instance *ti = rcu_dereference_ovsl(tbl->ti); u32 hash = meter_hash(ti, meter->id); int err; /* In generally, slots selected should be empty, because * OvS uses id-pool to fetch a available id. */ if (unlikely(rcu_dereference_ovsl(ti->dp_meters[hash]))) return -EBUSY; dp_meter_instance_insert(ti, meter); /* That function is thread-safe. 
*/ tbl->count++; if (tbl->count >= tbl->max_meters_allowed) { err = -EFBIG; goto attach_err; } if (tbl->count >= ti->n_meters && dp_meter_instance_realloc(tbl, ti->n_meters * 2)) { err = -ENOMEM; goto attach_err; } return 0; attach_err: dp_meter_instance_remove(ti, meter); tbl->count--; return err; } static int detach_meter(struct dp_meter_table *tbl, struct dp_meter *meter) { struct dp_meter_instance *ti; ASSERT_OVSL(); if (!meter) return 0; ti = rcu_dereference_ovsl(tbl->ti); dp_meter_instance_remove(ti, meter); tbl->count--; /* Shrink the meter array if necessary. */ if (ti->n_meters > DP_METER_ARRAY_SIZE_MIN && tbl->count <= (ti->n_meters / 4)) { int half_size = ti->n_meters / 2; int i; /* Avoid hash collision, don't move slots to other place. * Make sure there are no references of meters in array * which will be released. */ for (i = half_size; i < ti->n_meters; i++) if (rcu_dereference_ovsl(ti->dp_meters[i])) goto out; if (dp_meter_instance_realloc(tbl, half_size)) goto shrink_err; } out: return 0; shrink_err: dp_meter_instance_insert(ti, meter); tbl->count++; return -ENOMEM; } static struct sk_buff * ovs_meter_cmd_reply_start(struct genl_info *info, u8 cmd, struct ovs_header **ovs_reply_header) { struct sk_buff *skb; struct ovs_header *ovs_header = genl_info_userhdr(info); skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); if (!skb) return ERR_PTR(-ENOMEM); *ovs_reply_header = genlmsg_put(skb, info->snd_portid, info->snd_seq, &dp_meter_genl_family, 0, cmd); if (!*ovs_reply_header) { nlmsg_free(skb); return ERR_PTR(-EMSGSIZE); } (*ovs_reply_header)->dp_ifindex = ovs_header->dp_ifindex; return skb; } static int ovs_meter_cmd_reply_stats(struct sk_buff *reply, u32 meter_id, struct dp_meter *meter) { struct nlattr *nla; struct dp_meter_band *band; u16 i; if (nla_put_u32(reply, OVS_METER_ATTR_ID, meter_id)) goto error; if (nla_put(reply, OVS_METER_ATTR_STATS, sizeof(struct ovs_flow_stats), &meter->stats)) goto error; if (nla_put_u64_64bit(reply, OVS_METER_ATTR_USED, meter->used, OVS_METER_ATTR_PAD)) goto error; nla = nla_nest_start_noflag(reply, OVS_METER_ATTR_BANDS); if (!nla) goto error; band = meter->bands; for (i = 0; i < meter->n_bands; ++i, ++band) { struct nlattr *band_nla; band_nla = nla_nest_start_noflag(reply, OVS_BAND_ATTR_UNSPEC); if (!band_nla || nla_put(reply, OVS_BAND_ATTR_STATS, sizeof(struct ovs_flow_stats), &band->stats)) goto error; nla_nest_end(reply, band_nla); } nla_nest_end(reply, nla); return 0; error: return -EMSGSIZE; } static int ovs_meter_cmd_features(struct sk_buff *skb, struct genl_info *info) { struct ovs_header *ovs_header = genl_info_userhdr(info); struct ovs_header *ovs_reply_header; struct nlattr *nla, *band_nla; struct sk_buff *reply; struct datapath *dp; int err = -EMSGSIZE; reply = ovs_meter_cmd_reply_start(info, OVS_METER_CMD_FEATURES, &ovs_reply_header); if (IS_ERR(reply)) return PTR_ERR(reply); ovs_lock(); dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); if (!dp) { err = -ENODEV; goto exit_unlock; } if (nla_put_u32(reply, OVS_METER_ATTR_MAX_METERS, dp->meter_tbl.max_meters_allowed)) goto exit_unlock; ovs_unlock(); if (nla_put_u32(reply, OVS_METER_ATTR_MAX_BANDS, DP_MAX_BANDS)) goto nla_put_failure; nla = nla_nest_start_noflag(reply, OVS_METER_ATTR_BANDS); if (!nla) goto nla_put_failure; band_nla = nla_nest_start_noflag(reply, OVS_BAND_ATTR_UNSPEC); if (!band_nla) goto nla_put_failure; /* Currently only DROP band type is supported. 
*/ if (nla_put_u32(reply, OVS_BAND_ATTR_TYPE, OVS_METER_BAND_TYPE_DROP)) goto nla_put_failure; nla_nest_end(reply, band_nla); nla_nest_end(reply, nla); genlmsg_end(reply, ovs_reply_header); return genlmsg_reply(reply, info); exit_unlock: ovs_unlock(); nla_put_failure: nlmsg_free(reply); return err; } static struct dp_meter *dp_meter_create(struct nlattr **a) { struct nlattr *nla; int rem; u16 n_bands = 0; struct dp_meter *meter; struct dp_meter_band *band; int err; /* Validate attributes, count the bands. */ if (!a[OVS_METER_ATTR_BANDS]) return ERR_PTR(-EINVAL); nla_for_each_nested(nla, a[OVS_METER_ATTR_BANDS], rem) if (++n_bands > DP_MAX_BANDS) return ERR_PTR(-EINVAL); /* Allocate and set up the meter before locking anything. */ meter = kzalloc(struct_size(meter, bands, n_bands), GFP_KERNEL_ACCOUNT); if (!meter) return ERR_PTR(-ENOMEM); meter->id = nla_get_u32(a[OVS_METER_ATTR_ID]); meter->used = div_u64(ktime_get_ns(), 1000 * 1000); meter->kbps = a[OVS_METER_ATTR_KBPS] ? 1 : 0; meter->keep_stats = !a[OVS_METER_ATTR_CLEAR]; spin_lock_init(&meter->lock); if (meter->keep_stats && a[OVS_METER_ATTR_STATS]) { meter->stats = *(struct ovs_flow_stats *) nla_data(a[OVS_METER_ATTR_STATS]); } meter->n_bands = n_bands; /* Set up meter bands. */ band = meter->bands; nla_for_each_nested(nla, a[OVS_METER_ATTR_BANDS], rem) { struct nlattr *attr[OVS_BAND_ATTR_MAX + 1]; u32 band_max_delta_t; err = nla_parse_deprecated((struct nlattr **)&attr, OVS_BAND_ATTR_MAX, nla_data(nla), nla_len(nla), band_policy, NULL); if (err) goto exit_free_meter; if (!attr[OVS_BAND_ATTR_TYPE] || !attr[OVS_BAND_ATTR_RATE] || !attr[OVS_BAND_ATTR_BURST]) { err = -EINVAL; goto exit_free_meter; } band->type = nla_get_u32(attr[OVS_BAND_ATTR_TYPE]); band->rate = nla_get_u32(attr[OVS_BAND_ATTR_RATE]); if (band->rate == 0) { err = -EINVAL; goto exit_free_meter; } band->burst_size = nla_get_u32(attr[OVS_BAND_ATTR_BURST]); /* Figure out max delta_t that is enough to fill any bucket. * Keep max_delta_t size to the bucket units: * pkts => 1/1000 packets, kilobits => bits. * * Start with a full bucket. */ band->bucket = band->burst_size * 1000ULL; band_max_delta_t = div_u64(band->bucket, band->rate); if (band_max_delta_t > meter->max_delta_t) meter->max_delta_t = band_max_delta_t; band++; } return meter; exit_free_meter: kfree(meter); return ERR_PTR(err); } static int ovs_meter_cmd_set(struct sk_buff *skb, struct genl_info *info) { struct nlattr **a = info->attrs; struct dp_meter *meter, *old_meter; struct sk_buff *reply; struct ovs_header *ovs_reply_header; struct ovs_header *ovs_header = genl_info_userhdr(info); struct dp_meter_table *meter_tbl; struct datapath *dp; int err; u32 meter_id; bool failed; if (!a[OVS_METER_ATTR_ID]) return -EINVAL; meter = dp_meter_create(a); if (IS_ERR(meter)) return PTR_ERR(meter); reply = ovs_meter_cmd_reply_start(info, OVS_METER_CMD_SET, &ovs_reply_header); if (IS_ERR(reply)) { err = PTR_ERR(reply); goto exit_free_meter; } ovs_lock(); dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); if (!dp) { err = -ENODEV; goto exit_unlock; } meter_tbl = &dp->meter_tbl; meter_id = nla_get_u32(a[OVS_METER_ATTR_ID]); old_meter = lookup_meter(meter_tbl, meter_id); err = detach_meter(meter_tbl, old_meter); if (err) goto exit_unlock; err = attach_meter(meter_tbl, meter); if (err) goto exit_free_old_meter; ovs_unlock(); /* Build response with the meter_id and stats from * the old meter, if any. 
*/ failed = nla_put_u32(reply, OVS_METER_ATTR_ID, meter_id); WARN_ON(failed); if (old_meter) { spin_lock_bh(&old_meter->lock); if (old_meter->keep_stats) { err = ovs_meter_cmd_reply_stats(reply, meter_id, old_meter); WARN_ON(err); } spin_unlock_bh(&old_meter->lock); ovs_meter_free(old_meter); } genlmsg_end(reply, ovs_reply_header); return genlmsg_reply(reply, info); exit_free_old_meter: ovs_meter_free(old_meter); exit_unlock: ovs_unlock(); nlmsg_free(reply); exit_free_meter: kfree(meter); return err; } static int ovs_meter_cmd_get(struct sk_buff *skb, struct genl_info *info) { struct ovs_header *ovs_header = genl_info_userhdr(info); struct ovs_header *ovs_reply_header; struct nlattr **a = info->attrs; struct dp_meter *meter; struct sk_buff *reply; struct datapath *dp; u32 meter_id; int err; if (!a[OVS_METER_ATTR_ID]) return -EINVAL; meter_id = nla_get_u32(a[OVS_METER_ATTR_ID]); reply = ovs_meter_cmd_reply_start(info, OVS_METER_CMD_GET, &ovs_reply_header); if (IS_ERR(reply)) return PTR_ERR(reply); ovs_lock(); dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); if (!dp) { err = -ENODEV; goto exit_unlock; } /* Locate meter, copy stats. */ meter = lookup_meter(&dp->meter_tbl, meter_id); if (!meter) { err = -ENOENT; goto exit_unlock; } spin_lock_bh(&meter->lock); err = ovs_meter_cmd_reply_stats(reply, meter_id, meter); spin_unlock_bh(&meter->lock); if (err) goto exit_unlock; ovs_unlock(); genlmsg_end(reply, ovs_reply_header); return genlmsg_reply(reply, info); exit_unlock: ovs_unlock(); nlmsg_free(reply); return err; } static int ovs_meter_cmd_del(struct sk_buff *skb, struct genl_info *info) { struct ovs_header *ovs_header = genl_info_userhdr(info); struct ovs_header *ovs_reply_header; struct nlattr **a = info->attrs; struct dp_meter *old_meter; struct sk_buff *reply; struct datapath *dp; u32 meter_id; int err; if (!a[OVS_METER_ATTR_ID]) return -EINVAL; reply = ovs_meter_cmd_reply_start(info, OVS_METER_CMD_DEL, &ovs_reply_header); if (IS_ERR(reply)) return PTR_ERR(reply); ovs_lock(); dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); if (!dp) { err = -ENODEV; goto exit_unlock; } meter_id = nla_get_u32(a[OVS_METER_ATTR_ID]); old_meter = lookup_meter(&dp->meter_tbl, meter_id); if (old_meter) { spin_lock_bh(&old_meter->lock); err = ovs_meter_cmd_reply_stats(reply, meter_id, old_meter); WARN_ON(err); spin_unlock_bh(&old_meter->lock); err = detach_meter(&dp->meter_tbl, old_meter); if (err) goto exit_unlock; } ovs_unlock(); ovs_meter_free(old_meter); genlmsg_end(reply, ovs_reply_header); return genlmsg_reply(reply, info); exit_unlock: ovs_unlock(); nlmsg_free(reply); return err; } /* Meter action execution. * * Return true 'meter_id' drop band is triggered. The 'skb' should be * dropped by the caller'. */ bool ovs_meter_execute(struct datapath *dp, struct sk_buff *skb, struct sw_flow_key *key, u32 meter_id) { long long int now_ms = div_u64(ktime_get_ns(), 1000 * 1000); long long int long_delta_ms; struct dp_meter_band *band; struct dp_meter *meter; int i, band_exceeded_max = -1; u32 band_exceeded_rate = 0; u32 delta_ms; u32 cost; meter = lookup_meter(&dp->meter_tbl, meter_id); /* Do not drop the packet when there is no meter. */ if (!meter) return false; /* Lock the meter while using it. */ spin_lock(&meter->lock); long_delta_ms = (now_ms - meter->used); /* ms */ if (long_delta_ms < 0) { /* This condition means that we have several threads fighting * for a meter lock, and the one who received the packets a * bit later wins. 
Assuming that all racing threads received * packets at the same time to avoid overflow. */ long_delta_ms = 0; } /* Make sure delta_ms will not be too large, so that bucket will not * wrap around below. */ delta_ms = (long_delta_ms > (long long int)meter->max_delta_t) ? meter->max_delta_t : (u32)long_delta_ms; /* Update meter statistics. */ meter->used = now_ms; meter->stats.n_packets += 1; meter->stats.n_bytes += skb->len; /* Bucket rate is either in kilobits per second, or in packets per * second. We maintain the bucket in the units of either bits or * 1/1000th of a packet, correspondingly. * Then, when rate is multiplied with milliseconds, we get the * bucket units: * msec * kbps = bits, and * msec * packets/sec = 1/1000 packets. * * 'cost' is the number of bucket units in this packet. */ cost = (meter->kbps) ? skb->len * 8 : 1000; /* Update all bands and find the one hit with the highest rate. */ for (i = 0; i < meter->n_bands; ++i) { long long int max_bucket_size; band = &meter->bands[i]; max_bucket_size = band->burst_size * 1000LL; band->bucket += delta_ms * band->rate; if (band->bucket > max_bucket_size) band->bucket = max_bucket_size; if (band->bucket >= cost) { band->bucket -= cost; } else if (band->rate > band_exceeded_rate) { band_exceeded_rate = band->rate; band_exceeded_max = i; } } if (band_exceeded_max >= 0) { /* Update band statistics. */ band = &meter->bands[band_exceeded_max]; band->stats.n_packets += 1; band->stats.n_bytes += skb->len; /* Drop band triggered, let the caller drop the 'skb'. */ if (band->type == OVS_METER_BAND_TYPE_DROP) { spin_unlock(&meter->lock); return true; } } spin_unlock(&meter->lock); return false; } static const struct genl_small_ops dp_meter_genl_ops[] = { { .cmd = OVS_METER_CMD_FEATURES, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = 0, /* OK for unprivileged users. */ .doit = ovs_meter_cmd_features }, { .cmd = OVS_METER_CMD_SET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN * privilege. */ .doit = ovs_meter_cmd_set, }, { .cmd = OVS_METER_CMD_GET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = 0, /* OK for unprivileged users. */ .doit = ovs_meter_cmd_get, }, { .cmd = OVS_METER_CMD_DEL, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN * privilege. */ .doit = ovs_meter_cmd_del }, }; static const struct genl_multicast_group ovs_meter_multicast_group = { .name = OVS_METER_MCGROUP, }; struct genl_family dp_meter_genl_family __ro_after_init = { .hdrsize = sizeof(struct ovs_header), .name = OVS_METER_FAMILY, .version = OVS_METER_VERSION, .maxattr = OVS_METER_ATTR_MAX, .policy = meter_policy, .netnsok = true, .parallel_ops = true, .small_ops = dp_meter_genl_ops, .n_small_ops = ARRAY_SIZE(dp_meter_genl_ops), .resv_start_op = OVS_METER_CMD_GET + 1, .mcgrps = &ovs_meter_multicast_group, .n_mcgrps = 1, .module = THIS_MODULE, }; int ovs_meters_init(struct datapath *dp) { struct dp_meter_table *tbl = &dp->meter_tbl; struct dp_meter_instance *ti; unsigned long free_mem_bytes; ti = dp_meter_instance_alloc(DP_METER_ARRAY_SIZE_MIN); if (!ti) return -ENOMEM; /* Allow meters in a datapath to use ~3.12% of physical memory. 
*/ free_mem_bytes = nr_free_buffer_pages() * (PAGE_SIZE >> 5); tbl->max_meters_allowed = min(free_mem_bytes / sizeof(struct dp_meter), DP_METER_NUM_MAX); if (!tbl->max_meters_allowed) goto out_err; rcu_assign_pointer(tbl->ti, ti); tbl->count = 0; return 0; out_err: dp_meter_instance_free(ti); return -ENOMEM; } void ovs_meters_exit(struct datapath *dp) { struct dp_meter_table *tbl = &dp->meter_tbl; struct dp_meter_instance *ti = rcu_dereference_raw(tbl->ti); int i; for (i = 0; i < ti->n_meters; i++) ovs_meter_free(rcu_dereference_raw(ti->dp_meters[i])); dp_meter_instance_free(ti); }
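The band update in ovs_meter_execute() is a token bucket kept in bits (kbps meters) or 1/1000 of a packet (packet meters), so a refill of delta_ms * rate lands directly in bucket units. Below is a minimal standalone sketch of that arithmetic; the toy_* names and example numbers are invented for illustration and are not the kernel implementation.

/*
 * Standalone sketch of the OVS meter band token bucket: refill by
 * delta_ms * rate, cap at burst_size * 1000, then charge one packet.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct toy_band {
        uint32_t rate;          /* kbps, or packets/sec in packet mode */
        uint32_t burst_size;    /* kbits, or packets */
        int64_t bucket;         /* bits, or 1/1000 packet */
};

/* Refill for delta_ms milliseconds, then try to charge one packet. */
static bool toy_band_admit(struct toy_band *b, uint32_t delta_ms,
                           uint32_t pkt_len, bool kbps)
{
        int64_t max_bucket = (int64_t)b->burst_size * 1000;
        /* msec * kbps = bits; msec * packets/sec = 1/1000 packets */
        uint32_t cost = kbps ? pkt_len * 8 : 1000;

        b->bucket += (int64_t)delta_ms * b->rate;
        if (b->bucket > max_bucket)
                b->bucket = max_bucket;

        if (b->bucket >= cost) {
                b->bucket -= cost;
                return true;    /* packet conforms */
        }
        return false;           /* band exceeded: drop candidate */
}

int main(void)
{
        /* 1 Mbps meter with a 16 kbit burst, bucket starts full. */
        struct toy_band band = { .rate = 1000, .burst_size = 16,
                                 .bucket = 16 * 1000LL };

        printf("1500-byte packet admitted: %d\n",
               toy_band_admit(&band, 1, 1500, true));
        return 0;
}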
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_IOBITMAP_H
#define _ASM_X86_IOBITMAP_H

#include <linux/refcount.h>
#include <asm/processor.h>

struct io_bitmap {
        u64 sequence;
        refcount_t refcnt;
        /* The maximum number of bytes to copy so all zero bits are covered */
        unsigned int max;
        unsigned long bitmap[IO_BITMAP_LONGS];
};

struct task_struct;

#ifdef CONFIG_X86_IOPL_IOPERM
void io_bitmap_share(struct task_struct *tsk);
void io_bitmap_exit(struct task_struct *tsk);

static inline void native_tss_invalidate_io_bitmap(void)
{
        /*
         * Invalidate the I/O bitmap by moving io_bitmap_base outside the
         * TSS limit so any subsequent I/O access from user space will
         * trigger a #GP.
         *
         * This is correct even when VMEXIT rewrites the TSS limit
         * to 0x67 as the only requirement is that the base points
         * outside the limit.
         */
        this_cpu_write(cpu_tss_rw.x86_tss.io_bitmap_base,
                       IO_BITMAP_OFFSET_INVALID);
}

void native_tss_update_io_bitmap(void);

#ifdef CONFIG_PARAVIRT_XXL
#include <asm/paravirt.h>
#else
#define tss_update_io_bitmap native_tss_update_io_bitmap
#define tss_invalidate_io_bitmap native_tss_invalidate_io_bitmap
#endif

#else
static inline void io_bitmap_share(struct task_struct *tsk) { }
static inline void io_bitmap_exit(struct task_struct *tsk) { }
static inline void tss_update_io_bitmap(void) { }
#endif

#endif
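From user space, the per-task I/O bitmap this header describes is what backs ioperm(2): the syscall sets bits in the task's bitmap and the TSS io_bitmap_base is pointed at it on context switch. A hedged userspace illustration (x86 only, needs CAP_SYS_RAWIO; the chosen port range is just an example):

/*
 * Userspace view of the facility this header backs: ioperm(2) asks the
 * kernel to populate the per-task I/O bitmap. Illustration only.
 */
#include <stdio.h>
#include <sys/io.h>

int main(void)
{
        /* Request access to I/O ports 0x70-0x71 (example port range). */
        if (ioperm(0x70, 2, 1)) {
                perror("ioperm");
                return 1;
        }
        puts("port access granted; the task now has a real io_bitmap");
        ioperm(0x70, 2, 0);     /* drop access again */
        return 0;
}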
// SPDX-License-Identifier: GPL-2.0-or-later /* * Spanning tree protocol; interface code * Linux ethernet bridge * * Authors: * Lennert Buytenhek <buytenh@gnu.org> */ #include <linux/kernel.h> #include <linux/kmod.h> #include <linux/etherdevice.h> #include <linux/rtnetlink.h> #include <net/switchdev.h> #include "br_private.h" #include "br_private_stp.h" /* Port id is composed of priority and port number. * NB: some bits of priority are dropped to * make room for more ports.
*/ static inline port_id br_make_port_id(__u8 priority, __u16 port_no) { return ((u16)priority << BR_PORT_BITS) | (port_no & ((1<<BR_PORT_BITS)-1)); } #define BR_MAX_PORT_PRIORITY ((u16)~0 >> BR_PORT_BITS) /* called under bridge lock */ void br_init_port(struct net_bridge_port *p) { int err; p->port_id = br_make_port_id(p->priority, p->port_no); br_become_designated_port(p); br_set_state(p, BR_STATE_BLOCKING); p->topology_change_ack = 0; p->config_pending = 0; err = __set_ageing_time(p->dev, p->br->ageing_time); if (err) netdev_err(p->dev, "failed to offload ageing time\n"); } /* NO locks held */ void br_stp_enable_bridge(struct net_bridge *br) { struct net_bridge_port *p; spin_lock_bh(&br->lock); if (br->stp_enabled == BR_KERNEL_STP) mod_timer(&br->hello_timer, jiffies + br->hello_time); mod_delayed_work(system_long_wq, &br->gc_work, HZ / 10); br_config_bpdu_generation(br); list_for_each_entry(p, &br->port_list, list) { if (netif_running(p->dev) && netif_oper_up(p->dev)) br_stp_enable_port(p); } spin_unlock_bh(&br->lock); } /* NO locks held */ void br_stp_disable_bridge(struct net_bridge *br) { struct net_bridge_port *p; spin_lock_bh(&br->lock); list_for_each_entry(p, &br->port_list, list) { if (p->state != BR_STATE_DISABLED) br_stp_disable_port(p); } __br_set_topology_change(br, 0); br->topology_change_detected = 0; spin_unlock_bh(&br->lock); timer_delete_sync(&br->hello_timer); timer_delete_sync(&br->topology_change_timer); timer_delete_sync(&br->tcn_timer); cancel_delayed_work_sync(&br->gc_work); } /* called under bridge lock */ void br_stp_enable_port(struct net_bridge_port *p) { br_init_port(p); br_port_state_selection(p->br); br_ifinfo_notify(RTM_NEWLINK, NULL, p); } /* called under bridge lock */ void br_stp_disable_port(struct net_bridge_port *p) { struct net_bridge *br = p->br; int wasroot; wasroot = br_is_root_bridge(br); br_become_designated_port(p); br_set_state(p, BR_STATE_DISABLED); p->topology_change_ack = 0; p->config_pending = 0; br_ifinfo_notify(RTM_NEWLINK, NULL, p); timer_delete(&p->message_age_timer); timer_delete(&p->forward_delay_timer); timer_delete(&p->hold_timer); if (!rcu_access_pointer(p->backup_port)) br_fdb_delete_by_port(br, p, 0, 0); br_multicast_disable_port(p); br_configuration_update(br); br_port_state_selection(br); if (br_is_root_bridge(br) && !wasroot) br_become_root_bridge(br); } static int br_stp_call_user(struct net_bridge *br, char *arg) { char *argv[] = { BR_STP_PROG, br->dev->name, arg, NULL }; char *envp[] = { NULL }; int rc; /* call userspace STP and report program errors */ rc = call_usermodehelper(BR_STP_PROG, argv, envp, UMH_WAIT_PROC); if (rc > 0) { if (rc & 0xff) br_debug(br, BR_STP_PROG " received signal %d\n", rc & 0x7f); else br_debug(br, BR_STP_PROG " exited with code %d\n", (rc >> 8) & 0xff); } return rc; } static void br_stp_start(struct net_bridge *br) { int err = -ENOENT; if (net_eq(dev_net(br->dev), &init_net)) err = br_stp_call_user(br, "start"); if (err && err != -ENOENT) br_err(br, "failed to start userspace STP (%d)\n", err); spin_lock_bh(&br->lock); if (br->bridge_forward_delay < BR_MIN_FORWARD_DELAY) __br_set_forward_delay(br, BR_MIN_FORWARD_DELAY); else if (br->bridge_forward_delay > BR_MAX_FORWARD_DELAY) __br_set_forward_delay(br, BR_MAX_FORWARD_DELAY); if (!err) { br->stp_enabled = BR_USER_STP; br_debug(br, "userspace STP started\n"); } else { br->stp_enabled = BR_KERNEL_STP; br_debug(br, "using kernel STP\n"); /* To start timers on any ports left in blocking */ if (br->dev->flags & IFF_UP) mod_timer(&br->hello_timer, 
jiffies + br->hello_time); br_port_state_selection(br); } spin_unlock_bh(&br->lock); } static void br_stp_stop(struct net_bridge *br) { int err; if (br->stp_enabled == BR_USER_STP) { err = br_stp_call_user(br, "stop"); if (err) br_err(br, "failed to stop userspace STP (%d)\n", err); /* To start timers on any ports left in blocking */ spin_lock_bh(&br->lock); br_port_state_selection(br); spin_unlock_bh(&br->lock); } br->stp_enabled = BR_NO_STP; } int br_stp_set_enabled(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { ASSERT_RTNL(); if (br_mrp_enabled(br)) { NL_SET_ERR_MSG_MOD(extack, "STP can't be enabled if MRP is already enabled"); return -EINVAL; } if (val) { if (br->stp_enabled == BR_NO_STP) br_stp_start(br); } else { if (br->stp_enabled != BR_NO_STP) br_stp_stop(br); } return 0; } /* called under bridge lock */ void br_stp_change_bridge_id(struct net_bridge *br, const unsigned char *addr) { /* should be aligned on 2 bytes for ether_addr_equal() */ unsigned short oldaddr_aligned[ETH_ALEN >> 1]; unsigned char *oldaddr = (unsigned char *)oldaddr_aligned; struct net_bridge_port *p; int wasroot; wasroot = br_is_root_bridge(br); br_fdb_change_mac_address(br, addr); memcpy(oldaddr, br->bridge_id.addr, ETH_ALEN); memcpy(br->bridge_id.addr, addr, ETH_ALEN); eth_hw_addr_set(br->dev, addr); list_for_each_entry(p, &br->port_list, list) { if (ether_addr_equal(p->designated_bridge.addr, oldaddr)) memcpy(p->designated_bridge.addr, addr, ETH_ALEN); if (ether_addr_equal(p->designated_root.addr, oldaddr)) memcpy(p->designated_root.addr, addr, ETH_ALEN); } br_configuration_update(br); br_port_state_selection(br); if (br_is_root_bridge(br) && !wasroot) br_become_root_bridge(br); } /* should be aligned on 2 bytes for ether_addr_equal() */ static const unsigned short br_mac_zero_aligned[ETH_ALEN >> 1]; /* called under bridge lock */ bool br_stp_recalculate_bridge_id(struct net_bridge *br) { const unsigned char *br_mac_zero = (const unsigned char *)br_mac_zero_aligned; const unsigned char *addr = br_mac_zero; struct net_bridge_port *p; /* user has chosen a value so keep it */ if (br->dev->addr_assign_type == NET_ADDR_SET) return false; list_for_each_entry(p, &br->port_list, list) { if (addr == br_mac_zero || memcmp(p->dev->dev_addr, addr, ETH_ALEN) < 0) addr = p->dev->dev_addr; } if (ether_addr_equal(br->bridge_id.addr, addr)) return false; /* no change */ br_stp_change_bridge_id(br, addr); return true; } /* Acquires and releases bridge lock */ void br_stp_set_bridge_priority(struct net_bridge *br, u16 newprio) { struct net_bridge_port *p; int wasroot; spin_lock_bh(&br->lock); wasroot = br_is_root_bridge(br); list_for_each_entry(p, &br->port_list, list) { if (p->state != BR_STATE_DISABLED && br_is_designated_port(p)) { p->designated_bridge.prio[0] = (newprio >> 8) & 0xFF; p->designated_bridge.prio[1] = newprio & 0xFF; } } br->bridge_id.prio[0] = (newprio >> 8) & 0xFF; br->bridge_id.prio[1] = newprio & 0xFF; br_configuration_update(br); br_port_state_selection(br); if (br_is_root_bridge(br) && !wasroot) br_become_root_bridge(br); spin_unlock_bh(&br->lock); } /* called under bridge lock */ int br_stp_set_port_priority(struct net_bridge_port *p, unsigned long newprio) { port_id new_port_id; if (newprio > BR_MAX_PORT_PRIORITY) return -ERANGE; new_port_id = br_make_port_id(newprio, p->port_no); if (br_is_designated_port(p)) p->designated_port = new_port_id; p->port_id = new_port_id; p->priority = newprio; if (!memcmp(&p->br->bridge_id, &p->designated_bridge, 8) && p->port_id < 
p->designated_port) { br_become_designated_port(p); br_port_state_selection(p->br); } return 0; } /* called under bridge lock */ int br_stp_set_path_cost(struct net_bridge_port *p, unsigned long path_cost) { if (path_cost < BR_MIN_PATH_COST || path_cost > BR_MAX_PATH_COST) return -ERANGE; p->flags |= BR_ADMIN_COST; p->path_cost = path_cost; br_configuration_update(p->br); br_port_state_selection(p->br); return 0; } ssize_t br_show_bridge_id(char *buf, const struct bridge_id *id) { return sprintf(buf, "%.2x%.2x.%.2x%.2x%.2x%.2x%.2x%.2x\n", id->prio[0], id->prio[1], id->addr[0], id->addr[1], id->addr[2], id->addr[3], id->addr[4], id->addr[5]); }
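br_make_port_id() packs the port priority into the bits above the port number, which occupies the low BR_PORT_BITS bits. A standalone sketch of the same packing and unpacking, assuming BR_PORT_BITS is 10 as in the bridge code; the toy_* names are illustrative.

/* Standalone illustration of the bridge port_id bit layout. */
#include <stdint.h>
#include <stdio.h>

#define TOY_PORT_BITS 10
#define TOY_MAX_PORT_PRIORITY ((uint16_t)~0 >> TOY_PORT_BITS)

static uint16_t toy_make_port_id(uint8_t priority, uint16_t port_no)
{
        return ((uint16_t)priority << TOY_PORT_BITS) |
               (port_no & ((1 << TOY_PORT_BITS) - 1));
}

int main(void)
{
        uint16_t id = toy_make_port_id(8, 5);

        printf("port_id=0x%04x priority=%d port_no=%d (max priority %d)\n",
               (unsigned)id, id >> TOY_PORT_BITS,
               id & ((1 << TOY_PORT_BITS) - 1), TOY_MAX_PORT_PRIORITY);
        return 0;
}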
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Do sleep inside a spin-lock
 * Copyright (c) 1999 by Takashi Iwai <tiwai@suse.de>
 */

#include <linux/export.h>
#include <sound/core.h>
#include "seq_lock.h"

/* wait until all locks are released */
void snd_use_lock_sync_helper(snd_use_lock_t *lockp, const char *file, int line)
{
        int warn_count = 5 * HZ;

        if (atomic_read(lockp) < 0) {
                pr_warn("ALSA: seq_lock: lock trouble [counter = %d] in %s:%d\n",
                        atomic_read(lockp), file, line);
                return;
        }
        while (atomic_read(lockp) > 0) {
                if (warn_count-- == 0)
                        pr_warn("ALSA: seq_lock: waiting [%d left] in %s:%d\n",
                                atomic_read(lockp), file, line);
                schedule_timeout_uninterruptible(1);
        }
}
EXPORT_SYMBOL(snd_use_lock_sync_helper);
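snd_use_lock_sync_helper() simply polls (with periodic warnings) until an atomic use counter drains to zero. A minimal userspace sketch of the same use-count pattern with C11 atomics; the toy_* names are invented, and the kernel version sleeps a tick between polls instead of busy-spinning.

/* Minimal sketch of the use-lock drain pattern with C11 atomics. */
#include <stdatomic.h>
#include <stdio.h>

static atomic_int use_count = 0;

static void toy_use_lock(void)   { atomic_fetch_add(&use_count, 1); }
static void toy_use_unlock(void) { atomic_fetch_sub(&use_count, 1); }

static void toy_use_lock_sync(void)
{
        /* Wait until every outstanding user has dropped its reference. */
        while (atomic_load(&use_count) > 0)
                ;       /* the kernel helper sleeps a tick and warns here */
}

int main(void)
{
        toy_use_lock();
        toy_use_unlock();
        toy_use_lock_sync();
        puts("all users drained");
        return 0;
}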
// SPDX-License-Identifier: GPL-2.0 /* * This file contains functions which manage clock event devices.
* * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner */ #include <linux/clockchips.h> #include <linux/hrtimer.h> #include <linux/init.h> #include <linux/module.h> #include <linux/smp.h> #include <linux/device.h> #include "tick-internal.h" /* The registered clock event devices */ static LIST_HEAD(clockevent_devices); static LIST_HEAD(clockevents_released); /* Protection for the above */ static DEFINE_RAW_SPINLOCK(clockevents_lock); /* Protection for unbind operations */ static DEFINE_MUTEX(clockevents_mutex); struct ce_unbind { struct clock_event_device *ce; int res; }; static u64 cev_delta2ns(unsigned long latch, struct clock_event_device *evt, bool ismax) { u64 clc = (u64) latch << evt->shift; u64 rnd; if (WARN_ON(!evt->mult)) evt->mult = 1; rnd = (u64) evt->mult - 1; /* * Upper bound sanity check. If the backwards conversion is * not equal latch, we know that the above shift overflowed. */ if ((clc >> evt->shift) != (u64)latch) clc = ~0ULL; /* * Scaled math oddities: * * For mult <= (1 << shift) we can safely add mult - 1 to * prevent integer rounding loss. So the backwards conversion * from nsec to device ticks will be correct. * * For mult > (1 << shift), i.e. device frequency is > 1GHz we * need to be careful. Adding mult - 1 will result in a value * which when converted back to device ticks can be larger * than latch by up to (mult - 1) >> shift. For the min_delta * calculation we still want to apply this in order to stay * above the minimum device ticks limit. For the upper limit * we would end up with a latch value larger than the upper * limit of the device, so we omit the add to stay below the * device upper boundary. * * Also omit the add if it would overflow the u64 boundary. */ if ((~0ULL - clc > rnd) && (!ismax || evt->mult <= (1ULL << evt->shift))) clc += rnd; do_div(clc, evt->mult); /* Deltas less than 1usec are pointless noise */ return clc > 1000 ? clc : 1000; } /** * clockevent_delta2ns - Convert a latch value (device ticks) to nanoseconds * @latch: value to convert * @evt: pointer to clock event device descriptor * * Math helper, returns latch value converted to nanoseconds (bound checked) */ u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt) { return cev_delta2ns(latch, evt, false); } EXPORT_SYMBOL_GPL(clockevent_delta2ns); static int __clockevents_switch_state(struct clock_event_device *dev, enum clock_event_state state) { if (dev->features & CLOCK_EVT_FEAT_DUMMY) return 0; /* Transition with new state-specific callbacks */ switch (state) { case CLOCK_EVT_STATE_DETACHED: /* The clockevent device is getting replaced. Shut it down. 
*/ case CLOCK_EVT_STATE_SHUTDOWN: if (dev->set_state_shutdown) return dev->set_state_shutdown(dev); return 0; case CLOCK_EVT_STATE_PERIODIC: /* Core internal bug */ if (!(dev->features & CLOCK_EVT_FEAT_PERIODIC)) return -ENOSYS; if (dev->set_state_periodic) return dev->set_state_periodic(dev); return 0; case CLOCK_EVT_STATE_ONESHOT: /* Core internal bug */ if (!(dev->features & CLOCK_EVT_FEAT_ONESHOT)) return -ENOSYS; if (dev->set_state_oneshot) return dev->set_state_oneshot(dev); return 0; case CLOCK_EVT_STATE_ONESHOT_STOPPED: /* Core internal bug */ if (WARN_ONCE(!clockevent_state_oneshot(dev), "Current state: %d\n", clockevent_get_state(dev))) return -EINVAL; if (dev->set_state_oneshot_stopped) return dev->set_state_oneshot_stopped(dev); else return -ENOSYS; default: return -ENOSYS; } } /** * clockevents_switch_state - set the operating state of a clock event device * @dev: device to modify * @state: new state * * Must be called with interrupts disabled ! */ void clockevents_switch_state(struct clock_event_device *dev, enum clock_event_state state) { if (clockevent_get_state(dev) != state) { if (__clockevents_switch_state(dev, state)) return; clockevent_set_state(dev, state); /* * A nsec2cyc multiplicator of 0 is invalid and we'd crash * on it, so fix it up and emit a warning: */ if (clockevent_state_oneshot(dev)) { if (WARN_ON(!dev->mult)) dev->mult = 1; } } } /** * clockevents_shutdown - shutdown the device and clear next_event * @dev: device to shutdown */ void clockevents_shutdown(struct clock_event_device *dev) { clockevents_switch_state(dev, CLOCK_EVT_STATE_SHUTDOWN); dev->next_event = KTIME_MAX; } /** * clockevents_tick_resume - Resume the tick device before using it again * @dev: device to resume */ int clockevents_tick_resume(struct clock_event_device *dev) { int ret = 0; if (dev->tick_resume) ret = dev->tick_resume(dev); return ret; } #ifdef CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST /* Limit min_delta to a jiffy */ #define MIN_DELTA_LIMIT (NSEC_PER_SEC / HZ) /** * clockevents_increase_min_delta - raise minimum delta of a clock event device * @dev: device to increase the minimum delta * * Returns 0 on success, -ETIME when the minimum delta reached the limit. */ static int clockevents_increase_min_delta(struct clock_event_device *dev) { /* Nothing to do if we already reached the limit */ if (dev->min_delta_ns >= MIN_DELTA_LIMIT) { printk_deferred(KERN_WARNING "CE: Reprogramming failure. Giving up\n"); dev->next_event = KTIME_MAX; return -ETIME; } if (dev->min_delta_ns < 5000) dev->min_delta_ns = 5000; else dev->min_delta_ns += dev->min_delta_ns >> 1; if (dev->min_delta_ns > MIN_DELTA_LIMIT) dev->min_delta_ns = MIN_DELTA_LIMIT; printk_deferred(KERN_WARNING "CE: %s increased min_delta_ns to %llu nsec\n", dev->name ? dev->name : "?", (unsigned long long) dev->min_delta_ns); return 0; } /** * clockevents_program_min_delta - Set clock event device to the minimum delay. * @dev: device to program * * Returns 0 on success, -ETIME when the retry loop failed. */ static int clockevents_program_min_delta(struct clock_event_device *dev) { unsigned long long clc; int64_t delta; int i; for (i = 0;;) { delta = dev->min_delta_ns; dev->next_event = ktime_add_ns(ktime_get(), delta); if (clockevent_state_shutdown(dev)) return 0; dev->retries++; clc = ((unsigned long long) delta * dev->mult) >> dev->shift; if (dev->set_next_event((unsigned long) clc, dev) == 0) return 0; if (++i > 2) { /* * We tried 3 times to program the device with the * given min_delta_ns. 
Try to increase the minimum * delta, if that fails as well get out of here. */ if (clockevents_increase_min_delta(dev)) return -ETIME; i = 0; } } } #else /* CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST */ /** * clockevents_program_min_delta - Set clock event device to the minimum delay. * @dev: device to program * * Returns 0 on success, -ETIME when the retry loop failed. */ static int clockevents_program_min_delta(struct clock_event_device *dev) { unsigned long long clc; int64_t delta = 0; int i; for (i = 0; i < 10; i++) { delta += dev->min_delta_ns; dev->next_event = ktime_add_ns(ktime_get(), delta); if (clockevent_state_shutdown(dev)) return 0; dev->retries++; clc = ((unsigned long long) delta * dev->mult) >> dev->shift; if (dev->set_next_event((unsigned long) clc, dev) == 0) return 0; } return -ETIME; } #endif /* CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST */ /** * clockevents_program_event - Reprogram the clock event device. * @dev: device to program * @expires: absolute expiry time (monotonic clock) * @force: program minimum delay if expires can not be set * * Returns 0 on success, -ETIME when the event is in the past. */ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, bool force) { unsigned long long clc; int64_t delta; int rc; if (WARN_ON_ONCE(expires < 0)) return -ETIME; dev->next_event = expires; if (clockevent_state_shutdown(dev)) return 0; /* We must be in ONESHOT state here */ WARN_ONCE(!clockevent_state_oneshot(dev), "Current state: %d\n", clockevent_get_state(dev)); /* Shortcut for clockevent devices that can deal with ktime. */ if (dev->features & CLOCK_EVT_FEAT_KTIME) return dev->set_next_ktime(expires, dev); delta = ktime_to_ns(ktime_sub(expires, ktime_get())); if (delta <= 0) return force ? clockevents_program_min_delta(dev) : -ETIME; delta = min(delta, (int64_t) dev->max_delta_ns); delta = max(delta, (int64_t) dev->min_delta_ns); clc = ((unsigned long long) delta * dev->mult) >> dev->shift; rc = dev->set_next_event((unsigned long) clc, dev); return (rc && force) ? clockevents_program_min_delta(dev) : rc; } /* * Called after a clockevent has been added which might * have replaced a current regular or broadcast device. A * released normal device might be a suitable replacement * for the current broadcast device. Similarly a released * broadcast device might be a suitable replacement for a * normal device. */ static void clockevents_notify_released(void) { struct clock_event_device *dev; /* * Keep iterating as long as tick_check_new_device() * replaces a device. */ while (!list_empty(&clockevents_released)) { dev = list_entry(clockevents_released.next, struct clock_event_device, list); list_move(&dev->list, &clockevent_devices); tick_check_new_device(dev); } } /* * Try to install a replacement clock event device */ static int clockevents_replace(struct clock_event_device *ced) { struct clock_event_device *dev, *newdev = NULL; list_for_each_entry(dev, &clockevent_devices, list) { if (dev == ced || !clockevent_state_detached(dev)) continue; if (!tick_check_replacement(newdev, dev)) continue; if (!try_module_get(dev->owner)) continue; if (newdev) module_put(newdev->owner); newdev = dev; } if (newdev) { tick_install_replacement(newdev); list_del_init(&ced->list); } return newdev ? 0 : -EBUSY; } /* * Called with clockevents_mutex and clockevents_lock held */ static int __clockevents_try_unbind(struct clock_event_device *ced, int cpu) { /* Fast track. 
Device is unused */ if (clockevent_state_detached(ced)) { list_del_init(&ced->list); return 0; } return ced == per_cpu(tick_cpu_device, cpu).evtdev ? -EAGAIN : -EBUSY; } /* * SMP function call to unbind a device */ static void __clockevents_unbind(void *arg) { struct ce_unbind *cu = arg; int res; raw_spin_lock(&clockevents_lock); res = __clockevents_try_unbind(cu->ce, smp_processor_id()); if (res == -EAGAIN) res = clockevents_replace(cu->ce); cu->res = res; raw_spin_unlock(&clockevents_lock); } /* * Issues smp function call to unbind a per cpu device. Called with * clockevents_mutex held. */ static int clockevents_unbind(struct clock_event_device *ced, int cpu) { struct ce_unbind cu = { .ce = ced, .res = -ENODEV }; smp_call_function_single(cpu, __clockevents_unbind, &cu, 1); return cu.res; } /* * Unbind a clockevents device. */ int clockevents_unbind_device(struct clock_event_device *ced, int cpu) { int ret; mutex_lock(&clockevents_mutex); ret = clockevents_unbind(ced, cpu); mutex_unlock(&clockevents_mutex); return ret; } EXPORT_SYMBOL_GPL(clockevents_unbind_device); /** * clockevents_register_device - register a clock event device * @dev: device to register */ void clockevents_register_device(struct clock_event_device *dev) { unsigned long flags; /* Initialize state to DETACHED */ clockevent_set_state(dev, CLOCK_EVT_STATE_DETACHED); if (!dev->cpumask) { WARN_ON(num_possible_cpus() > 1); dev->cpumask = cpumask_of(smp_processor_id()); } if (dev->cpumask == cpu_all_mask) { WARN(1, "%s cpumask == cpu_all_mask, using cpu_possible_mask instead\n", dev->name); dev->cpumask = cpu_possible_mask; } raw_spin_lock_irqsave(&clockevents_lock, flags); list_add(&dev->list, &clockevent_devices); tick_check_new_device(dev); clockevents_notify_released(); raw_spin_unlock_irqrestore(&clockevents_lock, flags); } EXPORT_SYMBOL_GPL(clockevents_register_device); static void clockevents_config(struct clock_event_device *dev, u32 freq) { u64 sec; if (!(dev->features & CLOCK_EVT_FEAT_ONESHOT)) return; /* * Calculate the maximum number of seconds we can sleep. Limit * to 10 minutes for hardware which can program more than * 32bit ticks so we still get reasonable conversion values. */ sec = dev->max_delta_ticks; do_div(sec, freq); if (!sec) sec = 1; else if (sec > 600 && dev->max_delta_ticks > UINT_MAX) sec = 600; clockevents_calc_mult_shift(dev, freq, sec); dev->min_delta_ns = cev_delta2ns(dev->min_delta_ticks, dev, false); dev->max_delta_ns = cev_delta2ns(dev->max_delta_ticks, dev, true); } /** * clockevents_config_and_register - Configure and register a clock event device * @dev: device to register * @freq: The clock frequency * @min_delta: The minimum clock ticks to program in oneshot mode * @max_delta: The maximum clock ticks to program in oneshot mode * * min/max_delta can be 0 for devices which do not support oneshot mode. 
*/ void clockevents_config_and_register(struct clock_event_device *dev, u32 freq, unsigned long min_delta, unsigned long max_delta) { dev->min_delta_ticks = min_delta; dev->max_delta_ticks = max_delta; clockevents_config(dev, freq); clockevents_register_device(dev); } EXPORT_SYMBOL_GPL(clockevents_config_and_register); int __clockevents_update_freq(struct clock_event_device *dev, u32 freq) { clockevents_config(dev, freq); if (clockevent_state_oneshot(dev)) return clockevents_program_event(dev, dev->next_event, false); if (clockevent_state_periodic(dev)) return __clockevents_switch_state(dev, CLOCK_EVT_STATE_PERIODIC); return 0; } /** * clockevents_update_freq - Update frequency and reprogram a clock event device. * @dev: device to modify * @freq: new device frequency * * Reconfigure and reprogram a clock event device in oneshot * mode. Must be called on the cpu for which the device delivers per * cpu timer events. If called for the broadcast device the core takes * care of serialization. * * Returns 0 on success, -ETIME when the event is in the past. */ int clockevents_update_freq(struct clock_event_device *dev, u32 freq) { unsigned long flags; int ret; local_irq_save(flags); ret = tick_broadcast_update_freq(dev, freq); if (ret == -ENODEV) ret = __clockevents_update_freq(dev, freq); local_irq_restore(flags); return ret; } /* * Noop handler when we shut down an event device */ void clockevents_handle_noop(struct clock_event_device *dev) { } /** * clockevents_exchange_device - release and request clock devices * @old: device to release (can be NULL) * @new: device to request (can be NULL) * * Called from various tick functions with clockevents_lock held and * interrupts disabled. */ void clockevents_exchange_device(struct clock_event_device *old, struct clock_event_device *new) { /* * Caller releases a clock event device. We queue it into the * released list and do a notify add later. */ if (old) { module_put(old->owner); clockevents_switch_state(old, CLOCK_EVT_STATE_DETACHED); list_move(&old->list, &clockevents_released); } if (new) { BUG_ON(!clockevent_state_detached(new)); clockevents_shutdown(new); } } /** * clockevents_suspend - suspend clock devices */ void clockevents_suspend(void) { struct clock_event_device *dev; list_for_each_entry_reverse(dev, &clockevent_devices, list) if (dev->suspend && !clockevent_state_detached(dev)) dev->suspend(dev); } /** * clockevents_resume - resume clock devices */ void clockevents_resume(void) { struct clock_event_device *dev; list_for_each_entry(dev, &clockevent_devices, list) if (dev->resume && !clockevent_state_detached(dev)) dev->resume(dev); } #ifdef CONFIG_HOTPLUG_CPU /** * tick_offline_cpu - Shutdown all clock events related * to this CPU and take it out of the * broadcast mechanism. * @cpu: The outgoing CPU * * Called by the dying CPU during teardown. */ void tick_offline_cpu(unsigned int cpu) { struct clock_event_device *dev, *tmp; raw_spin_lock(&clockevents_lock); tick_broadcast_offline(cpu); tick_shutdown(); /* * Unregister the clock event devices which were * released above. 
*/ list_for_each_entry_safe(dev, tmp, &clockevents_released, list) list_del(&dev->list); /* * Now check whether the CPU has left unused per cpu devices */ list_for_each_entry_safe(dev, tmp, &clockevent_devices, list) { if (cpumask_test_cpu(cpu, dev->cpumask) && cpumask_weight(dev->cpumask) == 1 && !tick_is_broadcast_device(dev)) { BUG_ON(!clockevent_state_detached(dev)); list_del(&dev->list); } } raw_spin_unlock(&clockevents_lock); } #endif #ifdef CONFIG_SYSFS static const struct bus_type clockevents_subsys = { .name = "clockevents", .dev_name = "clockevent", }; static DEFINE_PER_CPU(struct device, tick_percpu_dev); static struct tick_device *tick_get_tick_dev(struct device *dev); static ssize_t current_device_show(struct device *dev, struct device_attribute *attr, char *buf) { struct tick_device *td; ssize_t count = 0; raw_spin_lock_irq(&clockevents_lock); td = tick_get_tick_dev(dev); if (td && td->evtdev) count = sysfs_emit(buf, "%s\n", td->evtdev->name); raw_spin_unlock_irq(&clockevents_lock); return count; } static DEVICE_ATTR_RO(current_device); /* We don't support the abomination of removable broadcast devices */ static ssize_t unbind_device_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { char name[CS_NAME_LEN]; ssize_t ret = sysfs_get_uname(buf, name, count); struct clock_event_device *ce = NULL, *iter; if (ret < 0) return ret; ret = -ENODEV; mutex_lock(&clockevents_mutex); raw_spin_lock_irq(&clockevents_lock); list_for_each_entry(iter, &clockevent_devices, list) { if (!strcmp(iter->name, name)) { ret = __clockevents_try_unbind(iter, dev->id); ce = iter; break; } } raw_spin_unlock_irq(&clockevents_lock); /* * We hold clockevents_mutex, so ce can't go away */ if (ret == -EAGAIN) ret = clockevents_unbind(ce, dev->id); mutex_unlock(&clockevents_mutex); return ret ? ret : count; } static DEVICE_ATTR_WO(unbind_device); #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST static struct device tick_bc_dev = { .init_name = "broadcast", .id = 0, .bus = &clockevents_subsys, }; static struct tick_device *tick_get_tick_dev(struct device *dev) { return dev == &tick_bc_dev ? tick_get_broadcast_device() : &per_cpu(tick_cpu_device, dev->id); } static __init int tick_broadcast_init_sysfs(void) { int err = device_register(&tick_bc_dev); if (!err) err = device_create_file(&tick_bc_dev, &dev_attr_current_device); return err; } #else static struct tick_device *tick_get_tick_dev(struct device *dev) { return &per_cpu(tick_cpu_device, dev->id); } static inline int tick_broadcast_init_sysfs(void) { return 0; } #endif static int __init tick_init_sysfs(void) { int cpu; for_each_possible_cpu(cpu) { struct device *dev = &per_cpu(tick_percpu_dev, cpu); int err; dev->id = cpu; dev->bus = &clockevents_subsys; err = device_register(dev); if (!err) err = device_create_file(dev, &dev_attr_current_device); if (!err) err = device_create_file(dev, &dev_attr_unbind_device); if (err) return err; } return tick_broadcast_init_sysfs(); } static int __init clockevents_init_sysfs(void) { int err = subsys_system_register(&clockevents_subsys, NULL); if (!err) err = tick_init_sysfs(); return err; } device_initcall(clockevents_init_sysfs); #endif /* SYSFS */
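The conversions in this file all reduce to the usual mult/shift scaling: cev_delta2ns() computes ns = (ticks << shift) / mult (rounded up by mult - 1 so the reverse conversion does not undershoot), and clockevents_program_event() goes back with ticks = (ns * mult) >> shift. A standalone sketch with made-up factors approximating a 1 MHz device; the kernel additionally clamps the result to at least 1 usec and guards against overflow.

/* Standalone sketch of the clockevents mult/shift conversion math. */
#include <stdint.h>
#include <stdio.h>

static uint64_t toy_ticks_to_ns(uint64_t ticks, uint32_t mult, uint32_t shift)
{
        uint64_t clc = ticks << shift;

        /* Round up by mult - 1 so the reverse conversion stays >= ticks. */
        clc += mult - 1;
        return clc / mult;
}

static uint64_t toy_ns_to_ticks(uint64_t ns, uint32_t mult, uint32_t shift)
{
        return (ns * mult) >> shift;
}

int main(void)
{
        /* Made-up factors approximating a 1 MHz device (1 tick per usec). */
        uint32_t mult = 4294968, shift = 32;
        uint64_t ns = toy_ticks_to_ns(123, mult, shift);

        printf("123 ticks ~= %llu ns, back to %llu ticks\n",
               (unsigned long long)ns,
               (unsigned long long)toy_ns_to_ticks(ns, mult, shift));
        return 0;
}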
544 2 211 2 2 135 68 180 2 333 81 62 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_SEQ_FILE_H #define _LINUX_SEQ_FILE_H #include <linux/types.h> #include <linux/string.h> #include <linux/string_helpers.h> #include <linux/bug.h> #include <linux/mutex.h> #include <linux/nodemask.h> #include <linux/fs.h> #include <linux/cred.h> struct seq_operations; struct seq_file { char *buf; size_t size; size_t from; size_t count; size_t pad_until; loff_t index; loff_t read_pos; struct mutex lock; const struct seq_operations *op; int poll_event; const struct file *file; void *private; }; struct seq_operations { void * (*start) (struct seq_file *m, loff_t *pos); void (*stop) (struct seq_file *m, void *v); void * (*next) (struct seq_file *m, void *v, loff_t *pos); int (*show) (struct seq_file *m, void *v); }; #define SEQ_SKIP 1 /** * seq_has_overflowed - check if the buffer has overflowed * @m: the seq_file handle * * seq_files have a buffer which may overflow. When this happens a larger * buffer is reallocated and all the data will be printed again. * The overflow state is true when m->count == m->size. * * Returns true if the buffer received more than it can hold. */ static inline bool seq_has_overflowed(struct seq_file *m) { return m->count == m->size; } /** * seq_get_buf - get buffer to write arbitrary data to * @m: the seq_file handle * @bufp: the beginning of the buffer is stored here * * Return the number of bytes available in the buffer, or zero if * there's no space. */ static inline size_t seq_get_buf(struct seq_file *m, char **bufp) { BUG_ON(m->count > m->size); if (m->count < m->size) *bufp = m->buf + m->count; else *bufp = NULL; return m->size - m->count; } /** * seq_commit - commit data to the buffer * @m: the seq_file handle * @num: the number of bytes to commit * * Commit @num bytes of data written to a buffer previously acquired * by seq_buf_get. To signal an error condition, or that the data * didn't fit in the available space, pass a negative @num value. */ static inline void seq_commit(struct seq_file *m, int num) { if (num < 0) { m->count = m->size; } else { BUG_ON(m->count + num > m->size); m->count += num; } } /** * seq_setwidth - set padding width * @m: the seq_file handle * @size: the max number of bytes to pad. 
* * Call seq_setwidth() for setting max width, then call seq_printf() etc. and * finally call seq_pad() to pad the remaining bytes. */ static inline void seq_setwidth(struct seq_file *m, size_t size) { m->pad_until = m->count + size; } void seq_pad(struct seq_file *m, char c); char *mangle_path(char *s, const char *p, const char *esc); int seq_open(struct file *, const struct seq_operations *); ssize_t seq_read(struct file *, char __user *, size_t, loff_t *); ssize_t seq_read_iter(struct kiocb *iocb, struct iov_iter *iter); loff_t seq_lseek(struct file *, loff_t, int); int seq_release(struct inode *, struct file *); int seq_write(struct seq_file *seq, const void *data, size_t len); __printf(2, 0) void seq_vprintf(struct seq_file *m, const char *fmt, va_list args); __printf(2, 3) void seq_printf(struct seq_file *m, const char *fmt, ...); void seq_putc(struct seq_file *m, char c); void __seq_puts(struct seq_file *m, const char *s); static __always_inline void seq_puts(struct seq_file *m, const char *s) { if (!__builtin_constant_p(*s)) __seq_puts(m, s); else if (s[0] && !s[1]) seq_putc(m, s[0]); else seq_write(m, s, __builtin_strlen(s)); } void seq_put_decimal_ull_width(struct seq_file *m, const char *delimiter, unsigned long long num, unsigned int width); void seq_put_decimal_ull(struct seq_file *m, const char *delimiter, unsigned long long num); void seq_put_decimal_ll(struct seq_file *m, const char *delimiter, long long num); void seq_put_hex_ll(struct seq_file *m, const char *delimiter, unsigned long long v, unsigned int width); void seq_escape_mem(struct seq_file *m, const char *src, size_t len, unsigned int flags, const char *esc); static inline void seq_escape_str(struct seq_file *m, const char *src, unsigned int flags, const char *esc) { seq_escape_mem(m, src, strlen(src), flags, esc); } /** * seq_escape - print string into buffer, escaping some characters * @m: target buffer * @s: NULL-terminated string * @esc: set of characters that need escaping * * Puts string into buffer, replacing each occurrence of character from * @esc with usual octal escape. * * Use seq_has_overflowed() to check for errors. 
*/ static inline void seq_escape(struct seq_file *m, const char *s, const char *esc) { seq_escape_str(m, s, ESCAPE_OCTAL, esc); } void seq_hex_dump(struct seq_file *m, const char *prefix_str, int prefix_type, int rowsize, int groupsize, const void *buf, size_t len, bool ascii); int seq_path(struct seq_file *, const struct path *, const char *); int seq_file_path(struct seq_file *, struct file *, const char *); int seq_dentry(struct seq_file *, struct dentry *, const char *); int seq_path_root(struct seq_file *m, const struct path *path, const struct path *root, const char *esc); void *single_start(struct seq_file *, loff_t *); int single_open(struct file *, int (*)(struct seq_file *, void *), void *); int single_open_size(struct file *, int (*)(struct seq_file *, void *), void *, size_t); int single_release(struct inode *, struct file *); void *__seq_open_private(struct file *, const struct seq_operations *, int); int seq_open_private(struct file *, const struct seq_operations *, int); int seq_release_private(struct inode *, struct file *); #ifdef CONFIG_BINARY_PRINTF __printf(2, 0) void seq_bprintf(struct seq_file *m, const char *f, const u32 *binary); #endif #define DEFINE_SEQ_ATTRIBUTE(__name) \ static int __name ## _open(struct inode *inode, struct file *file) \ { \ int ret = seq_open(file, &__name ## _sops); \ if (!ret && inode->i_private) { \ struct seq_file *seq_f = file->private_data; \ seq_f->private = inode->i_private; \ } \ return ret; \ } \ \ static const struct file_operations __name ## _fops = { \ .owner = THIS_MODULE, \ .open = __name ## _open, \ .read = seq_read, \ .llseek = seq_lseek, \ .release = seq_release, \ } #define DEFINE_SHOW_ATTRIBUTE(__name) \ static int __name ## _open(struct inode *inode, struct file *file) \ { \ return single_open(file, __name ## _show, inode->i_private); \ } \ \ static const struct file_operations __name ## _fops = { \ .owner = THIS_MODULE, \ .open = __name ## _open, \ .read = seq_read, \ .llseek = seq_lseek, \ .release = single_release, \ } #define DEFINE_SHOW_STORE_ATTRIBUTE(__name) \ static int __name ## _open(struct inode *inode, struct file *file) \ { \ return single_open(file, __name ## _show, inode->i_private); \ } \ \ static const struct file_operations __name ## _fops = { \ .owner = THIS_MODULE, \ .open = __name ## _open, \ .read = seq_read, \ .write = __name ## _write, \ .llseek = seq_lseek, \ .release = single_release, \ } #define DEFINE_PROC_SHOW_ATTRIBUTE(__name) \ static int __name ## _open(struct inode *inode, struct file *file) \ { \ return single_open(file, __name ## _show, pde_data(inode)); \ } \ \ static const struct proc_ops __name ## _proc_ops = { \ .proc_open = __name ## _open, \ .proc_read = seq_read, \ .proc_lseek = seq_lseek, \ .proc_release = single_release, \ } static inline struct user_namespace *seq_user_ns(struct seq_file *seq) { #ifdef CONFIG_USER_NS return seq->file->f_cred->user_ns; #else extern struct user_namespace init_user_ns; return &init_user_ns; #endif } /** * seq_show_options - display mount options with appropriate escapes. * @m: the seq_file handle * @name: the mount option name * @value: the mount option name's value, can be NULL */ static inline void seq_show_option(struct seq_file *m, const char *name, const char *value) { seq_putc(m, ','); seq_escape(m, name, ",= \t\n\\"); if (value) { seq_putc(m, '='); seq_escape(m, value, ", \t\n\\"); } } /** * seq_show_option_n - display mount options with appropriate escapes * where @value must be a specific length (i.e. * not NUL-terminated). 
* @m: the seq_file handle * @name: the mount option name * @value: the mount option name's value, cannot be NULL * @length: the exact length of @value to display, must be constant expression * * This is a macro since this uses "length" to define the size of the * stack buffer. */ #define seq_show_option_n(m, name, value, length) { \ char val_buf[length + 1]; \ memcpy(val_buf, value, length); \ val_buf[length] = '\0'; \ seq_show_option(m, name, val_buf); \ } #define SEQ_START_TOKEN ((void *)1) /* * Helpers for iteration over list_head-s in seq_files */ extern struct list_head *seq_list_start(struct list_head *head, loff_t pos); extern struct list_head *seq_list_start_head(struct list_head *head, loff_t pos); extern struct list_head *seq_list_next(void *v, struct list_head *head, loff_t *ppos); extern struct list_head *seq_list_start_rcu(struct list_head *head, loff_t pos); extern struct list_head *seq_list_start_head_rcu(struct list_head *head, loff_t pos); extern struct list_head *seq_list_next_rcu(void *v, struct list_head *head, loff_t *ppos); /* * Helpers for iteration over hlist_head-s in seq_files */ extern struct hlist_node *seq_hlist_start(struct hlist_head *head, loff_t pos); extern struct hlist_node *seq_hlist_start_head(struct hlist_head *head, loff_t pos); extern struct hlist_node *seq_hlist_next(void *v, struct hlist_head *head, loff_t *ppos); extern struct hlist_node *seq_hlist_start_rcu(struct hlist_head *head, loff_t pos); extern struct hlist_node *seq_hlist_start_head_rcu(struct hlist_head *head, loff_t pos); extern struct hlist_node *seq_hlist_next_rcu(void *v, struct hlist_head *head, loff_t *ppos); /* Helpers for iterating over per-cpu hlist_head-s in seq_files */ extern struct hlist_node *seq_hlist_start_percpu(struct hlist_head __percpu *head, int *cpu, loff_t pos); extern struct hlist_node *seq_hlist_next_percpu(void *v, struct hlist_head __percpu *head, int *cpu, loff_t *pos); void seq_file_init(void); #endif
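DEFINE_SHOW_ATTRIBUTE() is the usual glue between a single show routine and a read-only debugfs file. A hedged sketch of module-side usage; the demo_* names, the "seq_demo" directory, and the data string are invented for illustration, not part of this header.

/* Illustrative usage of DEFINE_SHOW_ATTRIBUTE(); demo_* names are made up. */
#include <linux/debugfs.h>
#include <linux/module.h>
#include <linux/seq_file.h>

static int demo_show(struct seq_file *m, void *v)
{
        /* The data passed to debugfs_create_file() arrives as m->private. */
        seq_printf(m, "hello from %s\n",
                   m->private ? (char *)m->private : "demo");
        return 0;
}
DEFINE_SHOW_ATTRIBUTE(demo);

static struct dentry *demo_dir;

static int __init demo_init(void)
{
        demo_dir = debugfs_create_dir("seq_demo", NULL);
        debugfs_create_file("state", 0444, demo_dir, "demo data", &demo_fops);
        return 0;
}

static void __exit demo_exit(void)
{
        debugfs_remove(demo_dir);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");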
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 /* SPDX-License-Identifier: GPL-2.0 */ /* AF_XDP internal functions * Copyright(c) 2018 Intel Corporation. */ #ifndef _LINUX_XDP_SOCK_H #define _LINUX_XDP_SOCK_H #include <linux/bpf.h> #include <linux/workqueue.h> #include <linux/if_xdp.h> #include <linux/mutex.h> #include <linux/spinlock.h> #include <linux/mm.h> #include <net/sock.h> #define XDP_UMEM_SG_FLAG (1 << 1) struct net_device; struct xsk_queue; struct xdp_buff; struct xdp_umem { void *addrs; u64 size; u32 headroom; u32 chunk_size; u32 chunks; u32 npgs; struct user_struct *user; refcount_t users; u8 flags; u8 tx_metadata_len; bool zc; struct page **pgs; int id; struct list_head xsk_dma_list; struct work_struct work; }; struct xsk_map { struct bpf_map map; spinlock_t lock; /* Synchronize map updates */ atomic_t count; struct xdp_sock __rcu *xsk_map[]; }; struct xdp_sock { /* struct sock must be the first member of struct xdp_sock */ struct sock sk; struct xsk_queue *rx ____cacheline_aligned_in_smp; struct net_device *dev; struct xdp_umem *umem; struct list_head flush_node; struct xsk_buff_pool *pool; u16 queue_id; bool zc; bool sg; enum { XSK_READY = 0, XSK_BOUND, XSK_UNBOUND, } state; struct xsk_queue *tx ____cacheline_aligned_in_smp; struct list_head tx_list; /* record the number of tx descriptors sent by this xsk and * when it exceeds MAX_PER_SOCKET_BUDGET, an opportunity needs * to be given to other xsks for sending tx descriptors, thereby * preventing other XSKs from being starved. */ u32 tx_budget_spent; /* Statistics */ u64 rx_dropped; u64 rx_queue_full; /* When __xsk_generic_xmit() must return before it sees the EOP descriptor for the current * packet, the partially built skb is saved here so that packet building can resume in next * call of __xsk_generic_xmit(). */ struct sk_buff *skb; struct list_head map_list; /* Protects map_list */ spinlock_t map_list_lock; u32 max_tx_budget; /* Protects multiple processes in the control path */ struct mutex mutex; struct xsk_queue *fq_tmp; /* Only as tmp storage before bind */ struct xsk_queue *cq_tmp; /* Only as tmp storage before bind */ }; /* * AF_XDP TX metadata hooks for network devices. * The following hooks can be defined; unless noted otherwise, they are * optional and can be filled with a null pointer. * * void (*tmo_request_timestamp)(void *priv) * Called when AF_XDP frame requested egress timestamp. * * u64 (*tmo_fill_timestamp)(void *priv) * Called when AF_XDP frame, that had requested egress timestamp, * received a completion. The hook needs to return the actual HW timestamp. * * void (*tmo_request_checksum)(u16 csum_start, u16 csum_offset, void *priv) * Called when AF_XDP frame requested HW checksum offload. 
csum_start * indicates position where checksumming should start. * csum_offset indicates position where checksum should be stored. * * void (*tmo_request_launch_time)(u64 launch_time, void *priv) * Called when AF_XDP frame requested launch time HW offload support. * launch_time indicates the PTP time at which the device can schedule the * packet for transmission. */ struct xsk_tx_metadata_ops { void (*tmo_request_timestamp)(void *priv); u64 (*tmo_fill_timestamp)(void *priv); void (*tmo_request_checksum)(u16 csum_start, u16 csum_offset, void *priv); void (*tmo_request_launch_time)(u64 launch_time, void *priv); }; #ifdef CONFIG_XDP_SOCKETS int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp); int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp); void __xsk_map_flush(struct list_head *flush_list); /** * xsk_tx_metadata_to_compl - Save enough relevant metadata information * to perform tx completion in the future. * @meta: pointer to AF_XDP metadata area * @compl: pointer to output struct xsk_tx_metadata_to_compl * * This function should be called by the networking device when * it prepares AF_XDP egress packet. The value of @compl should be stored * and passed to xsk_tx_metadata_complete upon TX completion. */ static inline void xsk_tx_metadata_to_compl(struct xsk_tx_metadata *meta, struct xsk_tx_metadata_compl *compl) { if (!meta) return; if (meta->flags & XDP_TXMD_FLAGS_TIMESTAMP) compl->tx_timestamp = &meta->completion.tx_timestamp; else compl->tx_timestamp = NULL; } /** * xsk_tx_metadata_request - Evaluate AF_XDP TX metadata at submission * and call appropriate xsk_tx_metadata_ops operation. * @meta: pointer to AF_XDP metadata area * @ops: pointer to struct xsk_tx_metadata_ops * @priv: pointer to driver-private aread * * This function should be called by the networking device when * it prepares AF_XDP egress packet. */ static inline void xsk_tx_metadata_request(const struct xsk_tx_metadata *meta, const struct xsk_tx_metadata_ops *ops, void *priv) { if (!meta) return; if (ops->tmo_request_launch_time) if (meta->flags & XDP_TXMD_FLAGS_LAUNCH_TIME) ops->tmo_request_launch_time(meta->request.launch_time, priv); if (ops->tmo_request_timestamp) if (meta->flags & XDP_TXMD_FLAGS_TIMESTAMP) ops->tmo_request_timestamp(priv); if (ops->tmo_request_checksum) if (meta->flags & XDP_TXMD_FLAGS_CHECKSUM) ops->tmo_request_checksum(meta->request.csum_start, meta->request.csum_offset, priv); } /** * xsk_tx_metadata_complete - Evaluate AF_XDP TX metadata at completion * and call appropriate xsk_tx_metadata_ops operation. * @compl: pointer to completion metadata produced from xsk_tx_metadata_to_compl * @ops: pointer to struct xsk_tx_metadata_ops * @priv: pointer to driver-private aread * * This function should be called by the networking device upon * AF_XDP egress completion. 
*/ static inline void xsk_tx_metadata_complete(struct xsk_tx_metadata_compl *compl, const struct xsk_tx_metadata_ops *ops, void *priv) { if (!compl) return; if (!compl->tx_timestamp) return; *compl->tx_timestamp = ops->tmo_fill_timestamp(priv); } #else static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) { return -ENOTSUPP; } static inline int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp) { return -EOPNOTSUPP; } static inline void __xsk_map_flush(struct list_head *flush_list) { } static inline void xsk_tx_metadata_to_compl(struct xsk_tx_metadata *meta, struct xsk_tx_metadata_compl *compl) { } static inline void xsk_tx_metadata_request(struct xsk_tx_metadata *meta, const struct xsk_tx_metadata_ops *ops, void *priv) { } static inline void xsk_tx_metadata_complete(struct xsk_tx_metadata_compl *compl, const struct xsk_tx_metadata_ops *ops, void *priv) { } #endif /* CONFIG_XDP_SOCKETS */ #endif /* _LINUX_XDP_SOCK_H */
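The xsk_tx_metadata_ops hooks above are filled in by a driver and invoked through xsk_tx_metadata_request() and xsk_tx_metadata_complete(). A hedged sketch of how a hypothetical driver might wire up the timestamp pair; the mydrv_* names and the descriptor layout are invented, and the optional checksum/launch-time hooks are deliberately left NULL.

/* Hedged driver-side sketch of the AF_XDP TX metadata hooks. */
#include <net/xdp_sock.h>

struct mydrv_tx_desc {
        bool request_timestamp;
        u64 hw_timestamp;
};

static void mydrv_tmo_request_timestamp(void *priv)
{
        struct mydrv_tx_desc *desc = priv;

        desc->request_timestamp = true; /* ask HW to stamp this frame */
}

static u64 mydrv_tmo_fill_timestamp(void *priv)
{
        struct mydrv_tx_desc *desc = priv;

        return desc->hw_timestamp;      /* read back at completion time */
}

static const struct xsk_tx_metadata_ops mydrv_xsk_tmo = {
        .tmo_request_timestamp  = mydrv_tmo_request_timestamp,
        .tmo_fill_timestamp     = mydrv_tmo_fill_timestamp,
        /* tmo_request_checksum / tmo_request_launch_time are optional */
};

/* On the TX path the driver would call, per descriptor:
 *      xsk_tx_metadata_request(meta, &mydrv_xsk_tmo, desc);
 * and on TX completion:
 *      xsk_tx_metadata_complete(&compl, &mydrv_xsk_tmo, desc);
 */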
883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 
[Annotated source listing, lines 1-1697: Linux NFC LLCP core ("llcp"), GPL-2.0-or-later, Copyright (C) 2011 Intel Corporation, Copyright (C) 2014 Marvell International Ltd. The listing covers local-device registration, reference counting and cleanup (nfc_llcp_register_device, nfc_llcp_local_get/put, local_cleanup); socket list management, purge and release; WKS/SDP/local SAP allocation and release; general-bytes construction and remote general-bytes parsing; the SYMM and SDREQ timers and their work items; and the LLCP PDU receive/transmit paths for UI, CONNECT, DISC, CC, DM, SNL, AGF and I/RR/RNR frames.]
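For reference, the LLCP PDU header layout exercised by the receive path in the listing packs DSAP, PTYPE and SSAP into the first two header bytes, exactly as the nfc_llcp_dsap(), nfc_llcp_ptype() and nfc_llcp_ssap() helpers decode them. Below is a minimal standalone sketch of that bit layout; the two sample header bytes are made up for illustration.

#include <stdint.h>
#include <stdio.h>

/* LLCP header byte layout used by the listed helpers:
 * byte 0: DSAP in the top 6 bits, PTYPE high 2 bits in the bottom 2 bits
 * byte 1: PTYPE low 2 bits in the top 2 bits, SSAP in the bottom 6 bits
 */
static uint8_t llcp_dsap(const uint8_t *hdr)
{
	return (hdr[0] & 0xfc) >> 2;
}

static uint8_t llcp_ptype(const uint8_t *hdr)
{
	return ((hdr[0] & 0x03) << 2) | ((hdr[1] & 0xc0) >> 6);
}

static uint8_t llcp_ssap(const uint8_t *hdr)
{
	return hdr[1] & 0x3f;
}

int main(void)
{
	/* Hypothetical header bytes, for illustration only. */
	const uint8_t hdr[2] = { 0x05, 0x20 };

	printf("dsap %u ptype %u ssap %u\n",
	       llcp_dsap(hdr), llcp_ptype(hdr), llcp_ssap(hdr));
	return 0;
}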
[Annotated source listing, lines 1-1709: Linux soft-RoCE (rxe) responder, GPL-2.0 OR Linux-OpenIB, Copyright (c) 2016 Mellanox Technologies Ltd., Copyright (c) 2015 System Fabric Works, Inc. The listing covers the responder state-name table; request packet queueing (rxe_resp_queue_pkt); the PSN, opcode-sequence and opcode-validity checks for RC and UC QPs; SRQ/receive-queue resource and length checks; RETH/ATMETH parsing and rkey validation against memory windows and memory regions; and the send/write data-in paths.]
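The responder's check_psn() in the listing branches on the sign of psn_compare(pkt->psn, qp->resp.psn): a positive difference means the request is ahead of the expected PSN (NAK, unless one was already sent), a negative difference marks a duplicate request, and zero lets processing continue. A small sketch of a wraparound-safe 24-bit PSN comparison with that sign convention follows; the shift trick is an assumption used for illustration, and psn_compare() itself is defined elsewhere in the driver.

#include <stdint.h>
#include <stdio.h>

/* Sign-extend the 24-bit PSN difference so comparisons survive wraparound:
 * > 0  packet PSN is ahead of the expected PSN (out of sequence)
 * == 0 packet is the expected one
 * < 0  packet PSN is behind (duplicate request)
 */
static int psn_diff(uint32_t pkt_psn, uint32_t expected_psn)
{
	return (int32_t)((pkt_psn - expected_psn) << 8) >> 8;
}

int main(void)
{
	/* Hypothetical PSN values, including one across the 24-bit wrap. */
	printf("%d\n", psn_diff(10, 10));             /* 0             */
	printf("%d\n", psn_diff(12, 10));             /* 2             */
	printf("%d\n", psn_diff(0x000002, 0xfffffe)); /* 4, after wrap */
	return 0;
}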